diff --git a/mlir/include/mlir/Conversion/VectorToSCF/ProgressiveVectorToSCF.h b/mlir/include/mlir/Conversion/VectorToSCF/ProgressiveVectorToSCF.h deleted file mode 100644 --- a/mlir/include/mlir/Conversion/VectorToSCF/ProgressiveVectorToSCF.h +++ /dev/null @@ -1,71 +0,0 @@ -//===- ProgressiveVectorToSCF.h - Convert vector to SCF dialect -*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_CONVERSION_VECTORTOSCF_PROGRESSIVEVECTORTOSCF_H_ -#define MLIR_CONVERSION_VECTORTOSCF_PROGRESSIVEVECTORTOSCF_H_ - -#include "mlir/IR/PatternMatch.h" - -namespace mlir { -class MLIRContext; -class Pass; -class RewritePatternSet; - -/// When lowering an N-d vector transfer op to an (N-1)-d vector transfer op, -/// a temporary buffer is created through which individual (N-1)-d vector are -/// staged. This pattern can be applied multiple time, until the transfer op -/// is 1-d. -/// This is consistent with the lack of an LLVM instruction to dynamically -/// index into an aggregate (see the Vector dialect lowering to LLVM deep dive). -/// -/// An instruction such as: -/// ``` -/// vector.transfer_write %vec, %A[%a, %b, %c] : -/// vector<9x17x15xf32>, memref -/// ``` -/// Lowers to pseudo-IR resembling (unpacking one dimension): -/// ``` -/// %0 = alloca() : memref> -/// store %vec, %0[] : memref> -/// %1 = vector.type_cast %0 : -/// memref> to memref<9xvector<17x15xf32>> -/// affine.for %I = 0 to 9 { -/// %dim = dim %A, 0 : memref -/// %add = affine.apply %I + %a -/// %cmp = cmpi "slt", %add, %dim : index -/// scf.if %cmp { -/// %vec_2d = load %1[%I] : memref<9xvector<17x15xf32>> -/// vector.transfer_write %vec_2d, %A[%add, %b, %c] : -/// vector<17x15xf32>, memref -/// ``` -/// -/// When applying the pattern a second time, the existing alloca() operation -/// is reused and only a second vector.type_cast is added. - -struct ProgressiveVectorTransferToSCFOptions { - bool unroll = false; - ProgressiveVectorTransferToSCFOptions &setUnroll(bool u) { - unroll = u; - return *this; - } -}; - -/// Collect a set of patterns to convert from the Vector dialect to SCF + std. -void populateProgressiveVectorToSCFConversionPatterns( - RewritePatternSet &patterns, - const ProgressiveVectorTransferToSCFOptions &options = - ProgressiveVectorTransferToSCFOptions()); - -/// Create a pass to convert a subset of vector ops to SCF. -std::unique_ptr createProgressiveConvertVectorToSCFPass( - const ProgressiveVectorTransferToSCFOptions &options = - ProgressiveVectorTransferToSCFOptions()); - -} // namespace mlir - -#endif // MLIR_CONVERSION_VECTORTOSCF_PROGRESSIVEVECTORTOSCF_H_ diff --git a/mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h b/mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h --- a/mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h +++ b/mlir/include/mlir/Conversion/VectorToSCF/VectorToSCF.h @@ -1,4 +1,4 @@ -//===- VectorToSCF.h - Utils to convert from the vector dialect -*- C++ -*-===// +//===- VectorToSCF.h - Convert vector to SCF dialect ------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -15,57 +15,38 @@ class MLIRContext; class Pass; class RewritePatternSet; -using OwningRewritePatternList = RewritePatternSet; -/// Control whether unrolling is used when lowering vector transfer ops to SCF. +/// When lowering an N-d vector transfer op to an (N-1)-d vector transfer op, +/// a temporary buffer is created through which individual (N-1)-d vector are +/// staged. This pattern can be applied multiple time, until the transfer op +/// is 1-d. +/// This is consistent with the lack of an LLVM instruction to dynamically +/// index into an aggregate (see the Vector dialect lowering to LLVM deep dive). /// -/// Case 1: -/// ======= -/// When `unroll` is false, a temporary buffer is created through which -/// individual 1-D vector are staged. this is consistent with the lack of an -/// LLVM instruction to dynamically index into an aggregate (see the Vector -/// dialect lowering to LLVM deep dive). /// An instruction such as: /// ``` -/// vector.transfer_write %vec, %A[%base, %base] : -/// vector<17x15xf32>, memref +/// vector.transfer_write %vec, %A[%a, %b, %c] : +/// vector<9x17x15xf32>, memref /// ``` -/// Lowers to pseudo-IR resembling: +/// Lowers to pseudo-IR resembling (unpacking one dimension): /// ``` -/// %0 = alloc() : memref<17xvector<15xf32>> +/// %0 = alloca() : memref> +/// store %vec, %0[] : memref> /// %1 = vector.type_cast %0 : -/// memref<17xvector<15xf32>> to memref> -/// store %vec, %1[] : memref> -/// %dim = dim %A, 0 : memref -/// affine.for %I = 0 to 17 { -/// %add = affine.apply %I + %base +/// memref> to memref<9xvector<17x15xf32>> +/// affine.for %I = 0 to 9 { +/// %dim = dim %A, 0 : memref +/// %add = affine.apply %I + %a /// %cmp = cmpi "slt", %add, %dim : index /// scf.if %cmp { -/// %vec_1d = load %0[%I] : memref<17xvector<15xf32>> -/// vector.transfer_write %vec_1d, %A[%add, %base] : -/// vector<15xf32>, memref +/// %vec_2d = load %1[%I] : memref<9xvector<17x15xf32>> +/// vector.transfer_write %vec_2d, %A[%add, %b, %c] : +/// vector<17x15xf32>, memref /// ``` /// -/// Case 2: -/// ======= -/// When `unroll` is true, the temporary buffer is skipped and static indices -/// into aggregates can be used (see the Vector dialect lowering to LLVM deep -/// dive). -/// An instruction such as: -/// ``` -/// vector.transfer_write %vec, %A[%base, %base] : -/// vector<3x15xf32>, memref -/// ``` -/// Lowers to pseudo-IR resembling: -/// ``` -/// %0 = vector.extract %arg2[0] : vector<3x15xf32> -/// vector.transfer_write %0, %arg0[%arg1, %arg1] : vector<15xf32>, -/// memref %1 = affine.apply #map1()[%arg1] %2 = vector.extract -/// %arg2[1] : vector<3x15xf32> vector.transfer_write %2, %arg0[%1, %arg1] : -/// vector<15xf32>, memref %3 = affine.apply #map2()[%arg1] %4 = -/// vector.extract %arg2[2] : vector<3x15xf32> vector.transfer_write %4, -/// %arg0[%3, %arg1] : vector<15xf32>, memref -/// ``` +/// When applying the pattern a second time, the existing alloca() operation +/// is reused and only a second vector.type_cast is added. + struct VectorTransferToSCFOptions { bool unroll = false; VectorTransferToSCFOptions &setUnroll(bool u) { @@ -74,93 +55,6 @@ } }; -/// Implements lowering of TransferReadOp and TransferWriteOp to a -/// proper abstraction for the hardware. -/// -/// There are multiple cases. -/// -/// Case A: Permutation Map does not permute or broadcast. -/// ====================================================== -/// -/// Progressive lowering occurs to 1-D vector transfer ops according to the -/// description in `VectorTransferToSCFOptions`. 
-/// -/// Case B: Permutation Map permutes and/or broadcasts. -/// ====================================================== -/// -/// This path will be progressively deprecated and folded into the case above by -/// using vector broadcast and transpose operations. -/// -/// This path only emits a simple loop nest that performs clipped pointwise -/// copies from a remote to a locally allocated memory. -/// -/// Consider the case: -/// -/// ```mlir -/// // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into -/// // vector<32x256xf32> and pad with %f0 to handle the boundary case: -/// %f0 = constant 0.0f : f32 -/// scf.for %i0 = 0 to %0 { -/// scf.for %i1 = 0 to %1 step %c256 { -/// scf.for %i2 = 0 to %2 step %c32 { -/// %v = vector.transfer_read %A[%i0, %i1, %i2], %f0 -/// {permutation_map: (d0, d1, d2) -> (d2, d1)} : -/// memref, vector<32x256xf32> -/// }}} -/// ``` -/// -/// The rewriters construct loop and indices that access MemRef A in a pattern -/// resembling the following (while guaranteeing an always full-tile -/// abstraction): -/// -/// ```mlir -/// scf.for %d2 = 0 to %c256 { -/// scf.for %d1 = 0 to %c32 { -/// %s = %A[%i0, %i1 + %d1, %i2 + %d2] : f32 -/// %tmp[%d2, %d1] = %s -/// } -/// } -/// ``` -/// -/// In the current state, only a clipping transfer is implemented by `clip`, -/// which creates individual indexing expressions of the form: -/// -/// ```mlir-dsc -/// auto condMax = i + ii < N; -/// auto max = std_select(condMax, i + ii, N - one) -/// auto cond = i + ii < zero; -/// std_select(cond, zero, max); -/// ``` -/// -/// In the future, clipping should not be the only way and instead we should -/// load vectors + mask them. Similarly on the write side, load/mask/store for -/// implementing RMW behavior. -/// -/// Lowers TransferOp into a combination of: -/// 1. local memory allocation; -/// 2. perfect loop nest over: -/// a. scalar load/stores from local buffers (viewed as a scalar memref); -/// a. scalar store/load to original memref (with clipping). -/// 3. vector_load/store -/// 4. local memory deallocation. -/// Minor variations occur depending on whether a TransferReadOp or -/// a TransferWriteOp is rewritten. -template -struct VectorTransferRewriter : public RewritePattern { - explicit VectorTransferRewriter(VectorTransferToSCFOptions options, - MLIRContext *context); - - /// Used for staging the transfer in a local buffer. - MemRefType tmpMemRefType(TransferOpTy transfer) const; - - /// Performs the rewrite. - LogicalResult matchAndRewrite(Operation *op, - PatternRewriter &rewriter) const override; - - /// See description of `VectorTransferToSCFOptions`. - VectorTransferToSCFOptions options; -}; - /// Collect a set of patterns to convert from the Vector dialect to SCF + std. 
void populateVectorToSCFConversionPatterns( RewritePatternSet &patterns, diff --git a/mlir/lib/Conversion/VectorToSCF/CMakeLists.txt b/mlir/lib/Conversion/VectorToSCF/CMakeLists.txt --- a/mlir/lib/Conversion/VectorToSCF/CMakeLists.txt +++ b/mlir/lib/Conversion/VectorToSCF/CMakeLists.txt @@ -1,5 +1,4 @@ add_mlir_conversion_library(MLIRVectorToSCF - ProgressiveVectorToSCF.cpp VectorToSCF.cpp ADDITIONAL_HEADER_DIRS diff --git a/mlir/lib/Conversion/VectorToSCF/ProgressiveVectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/ProgressiveVectorToSCF.cpp deleted file mode 100644 --- a/mlir/lib/Conversion/VectorToSCF/ProgressiveVectorToSCF.cpp +++ /dev/null @@ -1,1142 +0,0 @@ -//===- ProgressiveVectorToSCF.h - Convert vector to SCF dialect -*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements lowering of vector transfer operations to SCF. -// -//===----------------------------------------------------------------------===// - -#include - -#include "mlir/Conversion/VectorToSCF/ProgressiveVectorToSCF.h" - -#include "../PassDetail.h" -#include "mlir/Dialect/Affine/EDSC/Intrinsics.h" -#include "mlir/Dialect/MemRef/EDSC/Intrinsics.h" -#include "mlir/Dialect/SCF/EDSC/Intrinsics.h" -#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h" -#include "mlir/Dialect/Vector/EDSC/Intrinsics.h" -#include "mlir/Dialect/Vector/VectorOps.h" -#include "mlir/Dialect/Vector/VectorUtils.h" -#include "mlir/IR/Builders.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "mlir/Transforms/Passes.h" - -using namespace mlir; -using namespace mlir::edsc; -using namespace mlir::edsc::intrinsics; -using vector::TransferReadOp; -using vector::TransferWriteOp; - -namespace { - -/// Attribute name used for labeling transfer ops during progressive lowering. -static const char kPassLabel[] = "__vector_to_scf_lowering__"; - -/// Lower to 1D transfer ops. Target-specific lowering will lower those. -static const int64_t kTargetRank = 1; - -/// Given a MemRefType with VectorType element type, unpack one dimension from -/// the VectorType into the MemRefType. -/// -/// E.g.: memref<9xvector<5x6xf32>> --> memref<9x5xvector<6xf32>> -static MemRefType unpackOneDim(MemRefType type) { - auto vectorType = type.getElementType().dyn_cast(); - auto memrefShape = type.getShape(); - SmallVector newMemrefShape; - newMemrefShape.append(memrefShape.begin(), memrefShape.end()); - newMemrefShape.push_back(vectorType.getDimSize(0)); - return MemRefType::get(newMemrefShape, - VectorType::get(vectorType.getShape().drop_front(), - vectorType.getElementType())); -} - -/// Helper data structure for data and mask buffers. -struct BufferAllocs { - Value dataBuffer; - Value maskBuffer; -}; - -/// Allocate temporary buffers for data (vector) and mask (if present). -/// TODO: Parallelism and threadlocal considerations. 
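The shape bookkeeping done by `unpackOneDim` above is easy to check in isolation. Below is a minimal standalone C++ sketch (a hypothetical analog operating on plain shape vectors rather than MLIR types, not the MLIR API itself) that mirrors the same computation: the leading vector dimension is moved onto the end of the memref shape.

```c++
#include <cassert>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

using Shape = std::vector<int64_t>;

// Analog of unpackOneDim: given the memref shape and the shape of its vector
// element type, move the leading vector dimension into the memref shape.
// E.g. memref<9 x vector<5x6xf32>>  -->  memref<9x5 x vector<6xf32>>.
std::pair<Shape, Shape> unpackOneDim(Shape memrefShape, Shape vectorShape) {
  assert(!vectorShape.empty() && "need at least one vector dim to unpack");
  memrefShape.push_back(vectorShape.front()); // 9     -> 9x5
  vectorShape.erase(vectorShape.begin());     // 5x6   -> 6
  return {memrefShape, vectorShape};
}

int main() {
  auto [m, v] = unpackOneDim({9}, {5, 6});
  std::cout << "memref shape:";
  for (int64_t d : m) std::cout << ' ' << d;   // 9 5
  std::cout << " / vector shape:";
  for (int64_t d : v) std::cout << ' ' << d;   // 6
  std::cout << '\n';
}
```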
-template -static BufferAllocs allocBuffers(OpTy xferOp) { - auto &b = ScopedContext::getBuilderRef(); - OpBuilder::InsertionGuard guard(b); - Operation *scope = - xferOp->template getParentWithTrait(); - assert(scope && "Expected op to be inside automatic allocation scope"); - b.setInsertionPointToStart(&scope->getRegion(0).front()); - - BufferAllocs result; - auto bufferType = MemRefType::get({}, xferOp.getVectorType()); - result.dataBuffer = memref_alloca(bufferType).value; - - if (xferOp.mask()) { - auto maskType = MemRefType::get({}, xferOp.mask().getType()); - Value maskBuffer = memref_alloca(maskType); - memref_store(xferOp.mask(), maskBuffer); - result.maskBuffer = memref_load(maskBuffer); - } - - return result; -} - -/// Given a vector transfer op, calculate which dimension of the `source` -/// memref should be unpacked in the next application of TransferOpConversion. -/// A return value of None indicates a broadcast. -template -static Optional unpackedDim(OpTy xferOp) { - auto map = xferOp.permutation_map(); - if (auto expr = map.getResult(0).template dyn_cast()) { - return expr.getPosition(); - } - assert(xferOp.isBroadcastDim(0) && - "Expected AffineDimExpr or AffineConstantExpr"); - return None; -} - -/// Compute the permutation map for the new (N-1)-D vector transfer op. This -/// map is identical to the current permutation map, but the first result is -/// omitted. -template -static AffineMap unpackedPermutationMap(OpTy xferOp, OpBuilder &builder) { - auto map = xferOp.permutation_map(); - return AffineMap::get( - map.getNumDims(), 0, map.getResults().drop_front(), - builder.getContext()); -} - -/// Calculate the indices for the new vector transfer op. -/// -/// E.g.: transfer_read %A[%a, %b, %c, %d] ... : vector<5x4x3xf32> ... -/// --> transfer_read %A[%a, %b + iv, %c, %d] ... vector<4x3f32> -/// ^^^^^^ -/// `iv` is the iteration variable of the (new) surrounding loop. -template -static void getXferIndices(OpTy xferOp, Value iv, - SmallVector &indices) { - typename OpTy::Adaptor adaptor(xferOp); - // Corresponding memref dim of the vector dim that is unpacked. - auto dim = unpackedDim(xferOp); - auto prevIndices = adaptor.indices(); - indices.append(prevIndices.begin(), prevIndices.end()); - - bool isBroadcast = !dim.hasValue(); - if (!isBroadcast) { - using edsc::op::operator+; - indices[dim.getValue()] = adaptor.indices()[dim.getValue()] + iv; - } -} - -static void maybeYieldValue( - bool hasRetVal, OpBuilder builder, Location loc, Value value) { - if (hasRetVal) { - builder.create(loc, value); - } else { - builder.create(loc); - } -} - -/// Generates a boolean Value that is true if the iv-th bit in xferOp's mask -/// is set to true. No such check is generated under following circumstances: -/// * xferOp does not have a mask. -/// * xferOp's mask is not 1D. (In case of (N>1)-D, a subvector of the mask is -/// computed and attached to the new transfer op in the pattern.) -/// * The to-be-unpacked dim of xferOp is a broadcast. -template -static Value generateMaskCheck(OpBuilder &builder, OpTy xferOp, Value iv) { - if (!xferOp.mask()) - return Value(); - if (xferOp.getMaskType().getRank() != 1) - return Value(); - if (xferOp.isBroadcastDim(0)) - return Value(); - - auto ivI32 = std_index_cast(IntegerType::get(builder.getContext(), 32), iv); - return vector_extract_element(xferOp.mask(), ivI32).value; -} - -/// Helper function TransferOpConversion and TransferOp1dConversion. 
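To make the index bookkeeping of `unpackedDim`, `getXferIndices`, and `unpackedPermutationMap` above concrete, here is a minimal standalone sketch using plain integers instead of SSA values and affine maps (the helper names are illustrative only): the unpacked dimension's index is offset by the loop induction variable, and the leading permutation-map result is dropped for the (N-1)-D op.

```c++
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Analog of getXferIndices: copy the original transfer indices and, unless the
// unpacked dim is a broadcast (no corresponding memref dim), add the loop
// induction variable to it.
std::vector<int64_t> getXferIndices(const std::vector<int64_t> &indices,
                                    std::optional<unsigned> unpackedDim,
                                    int64_t iv) {
  std::vector<int64_t> newIndices = indices;
  if (unpackedDim)                // broadcast dims keep their indices as-is
    newIndices[*unpackedDim] += iv;
  return newIndices;
}

// Analog of unpackedPermutationMap: the new map is the old one without its
// first result (a permutation map is modeled as a list of dim positions).
std::vector<unsigned> unpackedPermutationMap(std::vector<unsigned> map) {
  map.erase(map.begin());
  return map;
}

int main() {
  // transfer_read %A[%a, %b, %c, %d], where the unpacked vector dim maps to
  // memref dim 1, becomes a read at %A[%a, %b + iv, %c, %d] for iv = 3.
  auto idx = getXferIndices({10, 20, 30, 40}, /*unpackedDim=*/1, /*iv=*/3);
  for (int64_t i : idx) std::cout << i << ' ';  // 10 23 30 40
  auto newMap = unpackedPermutationMap({1, 0, 2});
  std::cout << "/ new map has " << newMap.size() << " results\n";  // 2
}
```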
-/// Generate an in-bounds check if the transfer op may go out-of-bounds on the -/// specified dimension `dim` with the loop iteration variable `iv`. -/// E.g., when unpacking dimension 0 from: -/// ``` -/// %vec = vector.transfer_read %A[%a, %b] %cst -/// : vector<5x4xf32>, memref -/// ``` -/// An if check similar to this will be generated inside the loop: -/// ``` -/// %d = memref.dim %A, %c0 : memref -/// if (%a + iv < %d) { -/// (in-bounds case) -/// } else { -/// (out-of-bounds case) -/// } -/// ``` -/// -/// If the transfer is 1D and has a mask, this function generates a more complex -/// check also accounts for potentially masked out elements. -/// -/// This function variant returns the value returned by `inBoundsCase` or -/// `outOfBoundsCase`. The MLIR type of the return value must be specified in -/// `resultTypes`. -template -static Value generateInBoundsCheck( - OpTy xferOp, Value iv, OpBuilder &builder, Optional dim, - TypeRange resultTypes, - function_ref inBoundsCase, - function_ref outOfBoundsCase = nullptr) { - bool hasRetVal = !resultTypes.empty(); - Value cond; // Condition to be built... - - // Condition check 1: Access in-bounds? - bool isBroadcast = !dim.hasValue(); // No in-bounds check for broadcasts. - if (!xferOp.isDimInBounds(0) && !isBroadcast) { - auto memrefDim = - memref_dim(xferOp.source(), std_constant_index(dim.getValue())); - using edsc::op::operator+; - auto memrefIdx = xferOp.indices()[dim.getValue()] + iv; - cond = std_cmpi_sgt(memrefDim.value, memrefIdx); - } - - // Condition check 2: Masked in? - if (auto maskCond = generateMaskCheck(builder, xferOp, iv)) { - if (cond) { - cond = builder.create(xferOp.getLoc(), cond, maskCond); - } else { - cond = maskCond; - } - } - - // If the condition is non-empty, generate an SCF::IfOp. - if (cond) { - auto check = builder.create( - xferOp.getLoc(), resultTypes, cond, - /*thenBuilder=*/[&](OpBuilder &builder, Location loc) { - maybeYieldValue(hasRetVal, builder, loc, inBoundsCase(builder, loc)); - }, /*elseBuilder=*/[&](OpBuilder &builder, Location loc) { - if (outOfBoundsCase) { - maybeYieldValue(hasRetVal, builder, loc, outOfBoundsCase(builder, loc)); - } else { - builder.create(loc); - } - }); - - return hasRetVal ? check.getResult(0) : Value(); - } - - // Condition is empty, no need for an SCF::IfOp. - return inBoundsCase(builder, xferOp.getLoc()); -} - -/// In this function variant, `inBoundsCase` and `outOfBoundsCase` do not have -/// a return value. Consequently, this function does not have a return value. -template -static void generateInBoundsCheck( - OpTy xferOp, Value iv, OpBuilder &builder, Optional dim, - function_ref inBoundsCase, - function_ref outOfBoundsCase = nullptr) { - generateInBoundsCheck( - xferOp, iv, builder, dim, /*resultTypes=*/TypeRange(), - /*inBoundsCase=*/[&](OpBuilder &builder, Location loc) { - inBoundsCase(builder, loc); - return Value(); - }, - /*outOfBoundsCase=*/[&](OpBuilder &builder, Location loc) { - if (outOfBoundsCase) - outOfBoundsCase(builder, loc); - return Value(); - }); -} - -/// Given an ArrayAttr, return a copy where the first element is dropped. -static ArrayAttr dropFirstElem(OpBuilder &builder, ArrayAttr attr) { - if (!attr) - return attr; - return ArrayAttr::get(builder.getContext(), attr.getValue().drop_front()); -} - -/// Add the pass label to a vector transfer op if its rank is not the target -/// rank. 
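The control flow emitted by `generateInBoundsCheck` above can be summarized with a small host-side analog (a sketch with made-up names; the real code builds an `scf.if`): the condition is the conjunction of the optional bounds check and the optional 1-D mask check, and either branch may produce a value.

```c++
#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>
#include <vector>

// Analog of generateInBoundsCheck: decide between the in-bounds and the
// out-of-bounds continuation for one loop iteration `iv`.
int64_t generateInBoundsCheck(std::optional<int64_t> dimSize, int64_t baseIdx,
                              int64_t iv, const std::vector<bool> *mask1d,
                              std::function<int64_t()> inBoundsCase,
                              std::function<int64_t()> outOfBoundsCase) {
  bool cond = true;
  // Condition check 1: access in-bounds? (skipped for broadcasts, i.e. no dim)
  if (dimSize)
    cond = cond && (baseIdx + iv < *dimSize);
  // Condition check 2: masked in? (only done for 1-D masks)
  if (mask1d)
    cond = cond && (*mask1d)[iv];
  return cond ? inBoundsCase() : outOfBoundsCase();
}

int main() {
  std::vector<bool> mask = {true, false, true};
  for (int64_t iv = 0; iv < 3; ++iv)
    std::cout << generateInBoundsCheck(/*dimSize=*/8, /*baseIdx=*/6, iv, &mask,
                                       [] { return 1; }, [] { return 0; })
              << ' ';
  // Prints: 1 0 0  (iv=1 is masked out, iv=2 runs out of bounds: 6 + 2 == 8)
  std::cout << '\n';
}
```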
-template -static void maybeApplyPassLabel(OpBuilder &builder, OpTy newXferOp) { - if (newXferOp.getVectorType().getRank() > kTargetRank) - newXferOp->setAttr(kPassLabel, builder.getUnitAttr()); -} - -/// Given a transfer op, find the memref from which the mask is loaded. This -/// is similar to Strategy::getBuffer. -template -static Value getMaskBuffer(OpTy xferOp) { - assert(xferOp.mask() && "Expected that transfer op has mask"); - auto loadOp = xferOp.mask().template getDefiningOp(); - assert(loadOp && "Expected transfer op mask produced by LoadOp"); - return loadOp.getMemRef(); -} - -/// Codegen strategy, depending on the operation. -template -struct Strategy; - -/// Code strategy for vector TransferReadOp. -template<> -struct Strategy { - /// Find the StoreOp that is used for writing the current TransferReadOp's - /// result to the temporary buffer allocation. - static memref::StoreOp getStoreOp(TransferReadOp xferOp) { - assert(xferOp->hasOneUse() && "Expected exactly one use of TransferReadOp"); - auto storeOp = dyn_cast( - (*xferOp->use_begin()).getOwner()); - assert(storeOp && "Expected TransferReadOp result used by StoreOp"); - return storeOp; - } - - /// Find the temporary buffer allocation. All labeled TransferReadOps are - /// used like this, where %buf is either the buffer allocation or a type cast - /// of the buffer allocation: - /// ``` - /// %vec = vector.transfer_read ... { __vector_to_scf_lowering__ } ... - /// memref.store %vec, %buf[...] ... - /// ``` - static Value getBuffer(TransferReadOp xferOp) { - return getStoreOp(xferOp).getMemRef(); - } - - /// Retrieve the indices of the current StoreOp that stores into the buffer. - static void getBufferIndices(TransferReadOp xferOp, - SmallVector &indices) { - auto storeOp = getStoreOp(xferOp); - auto prevIndices = memref::StoreOpAdaptor(storeOp).indices(); - indices.append(prevIndices.begin(), prevIndices.end()); - } - - /// Rewrite the TransferReadOp, assuming that there are no out-of-bounds - /// accesses on the to-be-unpacked dimension. - /// - /// 1. Generate a new (N-1)-d TransferReadOp using the loop iteration - /// variable `iv`. - /// 2. Store the result into the (already `vector.type_cast`ed) buffer. - /// - /// E.g.: - /// ``` - /// %vec = vector.transfer_read %A[%a+%i, %b, %c], %cst - /// : memref, vector<4x3xf32> - /// memref.store %vec, %buf[%i] : memref<5xvector<4x3xf32>> - /// ``` - /// Is rewritten to: - /// ``` - /// %casted = vector.type_cast %buf - /// : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>> - /// for %j = 0 to 4 { - /// %vec = vector.transfer_read %A[%a+%i, %b+%j, %c], %cst - /// : memref, vector<3xf32> - /// memref.store %vec, %casted[%i, %j] : memref<5x4xvector<3xf32>> - /// } - /// ``` - /// - /// Note: The loop and type cast are generated in TransferOpConversion. - /// The original TransferReadOp and store op are deleted in `cleanup`. - /// Note: The `mask` operand is set in TransferOpConversion. 
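As a reference for what the rewritten read IR computes, here is a minimal scalar C++ model of a padded 2-D `transfer_read` (assumed row-major source, hypothetical names): each in-bounds row is copied from the source, and rows that would read past the memref bound keep the padding value, which is what the `handleOutOfBoundsDim` splat corresponds to.

```c++
#include <cstdint>
#include <iostream>
#include <vector>

// Scalar model of: %v = vector.transfer_read %A[%a, %b], %pad
//   : memref<?x4xf32>, vector<3x4xf32>
// where only the leading dimension may run out of bounds.
std::vector<std::vector<float>> transferRead2D(
    const std::vector<std::vector<float>> &A, int64_t a, int64_t b, float pad) {
  std::vector<std::vector<float>> result(3, std::vector<float>(4, pad));
  for (int64_t i = 0; i < 3; ++i) {
    if (a + i >= (int64_t)A.size())
      continue;  // out-of-bounds row: keep the padding splat
    for (int64_t j = 0; j < 4; ++j)
      result[i][j] = A[a + i][b + j];  // in-bounds: (N-1)-D read of one row
  }
  return result;
}

int main() {
  std::vector<std::vector<float>> A(4, std::vector<float>(8, 1.0f));
  auto v = transferRead2D(A, /*a=*/2, /*b=*/0, /*pad=*/-1.0f);
  // Rows 2 and 3 of %A exist; the third requested row is padding.
  std::cout << v[0][0] << ' ' << v[1][0] << ' ' << v[2][0] << '\n';  // 1 1 -1
}
```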
- static TransferReadOp rewriteOp(OpBuilder &builder, TransferReadOp xferOp, - Value buffer, Value iv) { - SmallVector storeIndices; - getBufferIndices(xferOp, storeIndices); - storeIndices.push_back(iv); - - SmallVector xferIndices; - getXferIndices(xferOp, iv, xferIndices); - - auto bufferType = buffer.getType().dyn_cast(); - auto vecType = bufferType.getElementType().dyn_cast(); - auto inBoundsAttr = dropFirstElem(builder, xferOp.in_boundsAttr()); - auto newXfer = vector_transfer_read( - vecType, xferOp.source(), xferIndices, - AffineMapAttr::get(unpackedPermutationMap(xferOp, builder)), - xferOp.padding(), Value(), inBoundsAttr).value; - - maybeApplyPassLabel(builder, - dyn_cast(newXfer.getDefiningOp())); - - memref_store(newXfer, buffer, storeIndices); - return newXfer.getDefiningOp(); - } - - /// Handle out-of-bounds accesses on the to-be-unpacked dimension: Write - /// padding value to the temporary buffer. - static void handleOutOfBoundsDim( - OpBuilder &/*builder*/, TransferReadOp xferOp, Value buffer, - Value iv) { - SmallVector storeIndices; - getBufferIndices(xferOp, storeIndices); - storeIndices.push_back(iv); - - auto bufferType = buffer.getType().dyn_cast(); - auto vecType = bufferType.getElementType().dyn_cast(); - auto vec = std_splat(vecType, xferOp.padding()); - memref_store(vec, buffer, storeIndices); - } - - /// Cleanup after rewriting the op. - static void cleanup(PatternRewriter &rewriter, TransferReadOp xferOp) { - rewriter.eraseOp(getStoreOp(xferOp)); - rewriter.eraseOp(xferOp); - } -}; - -/// Codegen strategy for vector TransferWriteOp. -template<> -struct Strategy { - /// Find the temporary buffer allocation. All labeled TransferWriteOps are - /// used like this, where %buf is either the buffer allocation or a type cast - /// of the buffer allocation: - /// ``` - /// %vec = memref.load %buf[...] ... - /// vector.transfer_write %vec ... { __vector_to_scf_lowering__ } ... - /// ``` - static Value getBuffer(TransferWriteOp xferOp) { - auto loadOp = xferOp.vector().getDefiningOp(); - assert(loadOp && "Expected transfer op vector produced by LoadOp"); - return loadOp.getMemRef(); - } - - /// Retrieve the indices of the current LoadOp that loads from the buffer. - static void getBufferIndices(TransferWriteOp xferOp, - SmallVector &indices) { - auto loadOp = xferOp.vector().getDefiningOp(); - auto prevIndices = memref::LoadOpAdaptor(loadOp).indices(); - indices.append(prevIndices.begin(), prevIndices.end()); - } - - /// Rewrite the TransferWriteOp, assuming that there are no out-of-bounds - /// accesses on the to-be-unpacked dimension. - /// - /// 1. Load an (N-1)-d vector from the (already `vector.type_cast`ed) buffer, - /// using the loop iteration variable `iv`. - /// 2. Generate a new (N-1)-d TransferWriteOp, writing the loaded vector back - /// to memory. - /// - /// Note: For more details, see comments on Strategy. 
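For symmetry with the read side, the same kind of scalar model applies to the write strategy (again a sketch with hypothetical names, unpacking only the leading dimension): each row is taken from the staged vector and written back only if its destination row is in bounds, matching the empty `handleOutOfBoundsDim` on the write path.

```c++
#include <cstdint>
#include <iostream>
#include <vector>

// Scalar model of: vector.transfer_write %v, %A[%a, %b]
//   : vector<3x4xf32>, memref<?x4xf32>
void transferWrite2D(std::vector<std::vector<float>> &A, int64_t a, int64_t b,
                     const std::vector<std::vector<float>> &v) {
  for (int64_t i = 0; i < (int64_t)v.size(); ++i) {
    if (a + i >= (int64_t)A.size())
      continue;  // out-of-bounds on the write side: nothing to do
    for (int64_t j = 0; j < (int64_t)v[i].size(); ++j)
      A[a + i][b + j] = v[i][j];  // (N-1)-D write of one row
  }
}

int main() {
  std::vector<std::vector<float>> A(4, std::vector<float>(8, 0.0f));
  std::vector<std::vector<float>> v(3, std::vector<float>(4, 7.0f));
  transferWrite2D(A, /*a=*/2, /*b=*/0, v);  // last requested row is dropped
  std::cout << A[2][0] << ' ' << A[3][0] << '\n';  // 7 7
}
```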
- static TransferWriteOp rewriteOp(OpBuilder &builder, TransferWriteOp xferOp, - Value buffer, Value iv) { - SmallVector loadIndices; - getBufferIndices(xferOp, loadIndices); - loadIndices.push_back(iv); - - SmallVector xferIndices; - getXferIndices(xferOp, iv, xferIndices); - - auto vec = memref_load(buffer, loadIndices); - auto inBoundsAttr = dropFirstElem(builder, xferOp.in_boundsAttr()); - auto newXfer = vector_transfer_write( - Type(), vec, xferOp.source(), xferIndices, - AffineMapAttr::get(unpackedPermutationMap(xferOp, builder)), - Value(), inBoundsAttr); - - maybeApplyPassLabel(builder, newXfer.op); - - return newXfer; - } - - /// Handle out-of-bounds accesses on the to-be-unpacked dimension. - static void handleOutOfBoundsDim( - OpBuilder &builder, TransferWriteOp xferOp, Value buffer, - Value iv) {} - - /// Cleanup after rewriting the op. - static void cleanup(PatternRewriter &rewriter, TransferWriteOp xferOp) { - rewriter.eraseOp(xferOp); - } -}; - -template -LogicalResult checkPrepareXferOp(OpTy xferOp) { - if (xferOp->hasAttr(kPassLabel)) - return failure(); - if (xferOp.getVectorType().getRank() <= kTargetRank) - return failure(); - return success(); -} - -/// Prepare a TransferReadOp for progressive lowering. -/// -/// 1. Allocate a temporary buffer. -/// 2. Label the TransferReadOp, marking it eligible for progressive lowering. -/// 3. Store the result of the TransferReadOp into the temporary buffer. -/// 4. Load the result from the temporary buffer and replace all uses of the -/// original TransferReadOp with this load. -/// -/// E.g.: -/// ``` -/// %vec = vector.transfer_read %A[%a, %b, %c], %cst -/// : vector<5x4xf32>, memref -/// ``` -/// is rewritten to: -/// ``` -/// %0 = memref.alloca() : memref> -/// %1 = vector.transfer_read %A[%a, %b, %c], %cst -/// { __vector_to_scf_lowering__ } : vector<5x4xf32>, memref -/// memref.store %1, %0[] : memref> -/// %vec = memref.load %0[] : memref> -/// ``` -/// -/// Note: A second temporary buffer may be allocated for the `mask` operand. -struct PrepareTransferReadConversion - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(TransferReadOp xferOp, - PatternRewriter &rewriter) const override { - if (checkPrepareXferOp(xferOp).failed()) - return failure(); - - ScopedContext scope(rewriter, xferOp.getLoc()); - auto buffers = allocBuffers(xferOp); - auto *newXfer = rewriter.clone(*xferOp.getOperation()); - newXfer->setAttr(kPassLabel, rewriter.getUnitAttr()); - if (xferOp.mask()) { - dyn_cast(newXfer).maskMutable().assign( - buffers.maskBuffer); - } - - memref_store(newXfer->getResult(0), buffers.dataBuffer); - rewriter.replaceOpWithNewOp(xferOp, buffers.dataBuffer); - - return success(); - } -}; - -/// Prepare a TransferWriteOp for progressive lowering. -/// -/// 1. Allocate a temporary buffer. -/// 2. Store the vector into the buffer. -/// 3. Load the vector from the buffer again. -/// 4. Use the loaded vector as a TransferWriteOp operand and label the op, -/// marking it eligible for progressive lowering via TransferOpConversion. -/// -/// E.g.: -/// ``` -/// vector.transfer_write %vec, %A[%a, %b, %c] -/// : vector<5x4xf32>, memref -/// ``` -/// is rewritten to: -/// ``` -/// %0 = memref.alloca() : memref> -/// memref.store %vec, %0[] : memref> -/// %1 = memref.load %0[] : memref> -/// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ } -/// : vector<5x4xf32>, memref -/// ``` -/// -/// Note: A second temporary buffer may be allocated for the `mask` operand. 
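The prepare/label/unpack patterns above are intended to be applied repeatedly by the greedy driver until every transfer op is 1-D. A toy fixed-point loop (a conceptual sketch with made-up types, not the MLIR pattern driver) makes the gating in `checkPrepareXferOp` and the role of `kTargetRank` explicit:

```c++
#include <cstdint>
#include <iostream>
#include <vector>

constexpr int64_t kTargetRank = 1;  // lower until transfers are 1-D

// Toy stand-in for a transfer op: only its vector shape matters here.
struct FakeXferOp {
  std::vector<int64_t> vectorShape;
  bool labeled = false;  // __vector_to_scf_lowering__ in the real pass
};

// Analog of checkPrepareXferOp: eligible if not yet labeled and rank > target.
bool needsPreparation(const FakeXferOp &op) {
  return !op.labeled && (int64_t)op.vectorShape.size() > kTargetRank;
}

int main() {
  FakeXferOp op{{5, 4, 3}};
  // Each round models one Prepare*Conversion followed by one
  // TransferOpConversion application: label the op, peel off the leading
  // vector dimension, and continue with the fresh, smaller op.
  while (needsPreparation(op)) {
    op.labeled = true;                            // Prepare*Conversion
    op.vectorShape.erase(op.vectorShape.begin()); // TransferOpConversion
    op.labeled = false;                           // new op starts unlabeled
    std::cout << "remaining rank: " << op.vectorShape.size() << '\n';
  }
  // Terminates at rank 1; the 1-D op is left for target-specific lowering.
}
```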
-struct PrepareTransferWriteConversion - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(TransferWriteOp xferOp, - PatternRewriter &rewriter) const override { - if (checkPrepareXferOp(xferOp).failed()) - return failure(); - - ScopedContext scope(rewriter, xferOp.getLoc()); - auto buffers = allocBuffers(xferOp); - memref_store(xferOp.vector(), buffers.dataBuffer); - auto loadedVec = memref_load(buffers.dataBuffer); - rewriter.updateRootInPlace(xferOp, [&]() { - xferOp.vectorMutable().assign(loadedVec); - xferOp->setAttr(kPassLabel, rewriter.getUnitAttr()); - }); - - if (xferOp.mask()) { - rewriter.updateRootInPlace( - xferOp, [&]() { xferOp.maskMutable().assign(buffers.maskBuffer); }); - } - - return success(); - } -}; - -/// Progressive lowering of vector transfer ops: Unpack one dimension. -/// -/// 1. Unpack one dimension from the current buffer type and cast the buffer -/// to that new type. E.g.: -/// ``` -/// %vec = memref.load %0[%1] : memref<5xvector<4x3xf32>> -/// vector.transfer_write %vec ... -/// ``` -/// The following cast is generated: -/// ``` -/// %casted = vector.type_cast %0 -/// : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>> -/// ``` -/// 2. Generate a for loop and rewrite the transfer op according to the -/// corresponding Strategy. If the to-be-unpacked dimension can be -/// out-of-bounds, generate an if-check and handle both cases separately. -/// 3. Clean up according to the corresponding Strategy. -template -struct TransferOpConversion : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(OpTy xferOp, - PatternRewriter &rewriter) const override { - if (!xferOp->hasAttr(kPassLabel)) - return failure(); - - ScopedContext scope(rewriter, xferOp.getLoc()); - - // Find and cast data buffer. How the buffer can be found depends on OpTy. - auto dataBuffer = Strategy::getBuffer(xferOp); - auto dataBufferType = dataBuffer.getType().template dyn_cast(); - auto castedDataType = unpackOneDim(dataBufferType); - auto castedDataBuffer = vector_type_cast(castedDataType, dataBuffer); - - // If the xferOp has a mask: Find and cast mask buffer. - Value castedMaskBuffer; - if (xferOp.mask()) { - auto maskBuffer = getMaskBuffer(xferOp); - auto maskBufferType = - maskBuffer.getType().template dyn_cast(); - if (xferOp.isBroadcastDim(0) || xferOp.getMaskType().getRank() == 1) { - // Do not unpack a dimension of the mask, if: - // * To-be-unpacked transfer op dimension is a broadcast. - // * Mask is 1D, i.e., the mask cannot be further unpacked. - // (That means that all remaining dimensions of the transfer op must - // be broadcasted.) - castedMaskBuffer = maskBuffer; - } else { - auto castedMaskType = unpackOneDim(maskBufferType); - castedMaskBuffer = vector_type_cast(castedMaskType, maskBuffer); - } - } - - // Loop bounds and step. - auto lb = std_constant_index(0).value; - auto ub = std_constant_index( - castedDataType.getDimSize(castedDataType.getRank() - 1)) - .value; - auto step = std_constant_index(1).value; - - // Generate for loop. - rewriter.create( - xferOp.getLoc(), lb, ub, step, ValueRange(), - [&](OpBuilder &b, Location loc, Value iv, - ValueRange /*loopState*/) { - ScopedContext scope(b, loc); - generateInBoundsCheck( - xferOp, iv, b, unpackedDim(xferOp), - /*inBoundsCase=*/ - [&](OpBuilder &b, Location /*loc*/) { - // Create new transfer op. 
- OpTy newXfer = - Strategy::rewriteOp(b, xferOp, castedDataBuffer, iv); - - // If old transfer op has a mask: Set mask on new transfer op. - // Special case: If the mask of the old transfer op is 1D and the - // unpacked dim is not a broadcast, no mask is needed - // on the new transfer op. - if (xferOp.mask() && (xferOp.isBroadcastDim(0) || - xferOp.getMaskType().getRank() > 1)) { - OpBuilder::InsertionGuard guard(b); - b.setInsertionPoint(newXfer); // Insert load before newXfer. - - SmallVector loadIndices; - Strategy::getBufferIndices(xferOp, loadIndices); - // In case of broadcast: Use same indices to load from memref as - // before. - if (!xferOp.isBroadcastDim(0)) - loadIndices.push_back(iv); - - auto mask = memref_load(castedMaskBuffer, loadIndices); - rewriter.updateRootInPlace( - newXfer, [&]() { newXfer.maskMutable().assign(mask); }); - } - }, - /*outOfBoundsCase=*/ - [&](OpBuilder &b, Location /*loc*/) { - Strategy::handleOutOfBoundsDim(b, xferOp, castedDataBuffer, - iv); - }); - b.create(loc); - }); - - Strategy::cleanup(rewriter, xferOp); - return success(); - } -}; - -/// If the original transfer op has a mask, compute the mask of the new transfer -/// op (for the current iteration `i`) and assign it. -template -static void maybeAssignMask(OpBuilder &builder, OpTy xferOp, OpTy newXferOp, - int64_t i) { - if (!xferOp.mask()) - return; - - if (xferOp.isBroadcastDim(0)) { - // To-be-unpacked dimension is a broadcast, which does not have a - // corresponding mask dimension. Mask attribute remains unchanged. - newXferOp.maskMutable().assign(xferOp.mask()); - return; - } - - if (xferOp.getMaskType().getRank() > 1) { - // Unpack one dimension of the mask. - OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPoint(newXferOp); // Insert load before newXfer. - - llvm::SmallVector indices({i}); - auto newMask = vector_extract(xferOp.mask(), indices).value; - newXferOp.maskMutable().assign(newMask); - } - - // If we end up here: The mask of the old transfer op is 1D and the unpacked - // dim is not a broadcast, so no mask is needed on the new transfer op. - // `generateInBoundsCheck` will have evaluated the mask already. -} - -/// Progressive lowering of vector TransferReadOp with unrolling: Unpack one -/// dimension. This is similar to TransferOpConversion, but no -/// memref buffer is allocated and the SCF loop is fully unrolled. -/// -/// ``` -/// E.g.: -/// ``` -/// %vec = vector.transfer_read %A[%a, %b, %c], %padding -/// : memref, vector<5x4xf32> -/// ``` -/// is rewritten to IR such as (simplified): -/// ``` -/// %v_init = splat %padding : vector<5x4xf32> -/// %tmp0 = vector.transfer_read %A[%a, %b, %c], %padding -/// : memref, vector<4xf32> -/// %v0 = vector.insert %tmp0, %v_init[0] : vector<4xf32> into vector<5x4xf32> -/// %tmp1 = vector.transfer_read %A[%a, %b + 1, %c], %padding -/// : memref, vector<4xf32> -/// %v1 = vector.insert %tmp1, %v0[1] : vector<4xf32> into vector<5x4xf32> -/// ... -/// %tmp4 = vector.transfer_read %A[%a, %b + 4, %c], %padding -/// : memref, vector<4xf32> -/// %vec = vector.insert %tmp1, %v3[4] : vector<4xf32> into vector<5x4xf32> -/// ``` -/// -/// Note: As an optimization, if the result of the original TransferReadOp -/// was directly inserted into another vector, no new %v_init vector is created. -/// Instead, the new TransferReadOp results are inserted into that vector. 
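The unrolled read path described above replaces the loop by `dimSize` explicit read+insert pairs. A scalar sketch of the same dataflow (2-D case, hypothetical helpers; an ordinary loop stands in for the compile-time unrolling) shows how the result starts as a padding splat and is filled row by row, skipping out-of-bounds rows:

```c++
#include <array>
#include <cstdint>
#include <iostream>

using Row = std::array<float, 4>;
using Vec = std::array<Row, 5>;  // models vector<5x4xf32>

// Analog of vector.insert %row, %vec[i]: functional update of one row.
Vec insertRow(Vec vec, const Row &row, int64_t i) {
  vec[i] = row;
  return vec;
}

int main() {
  const float padding = -1.0f;
  Vec vec;
  vec.fill(Row{padding, padding, padding, padding});  // %v_init = splat %padding

  // Unrolled chain, mirroring:
  //   %tmp_i = vector.transfer_read ...   (1-D read of row b+i, or skipped)
  //   %v_i   = vector.insert %tmp_i, %v_{i-1}[i]
  int64_t b = 3, dimSize = 6;  // only rows with b + i < dimSize are in bounds
  for (int64_t i = 0; i < 5; ++i) {
    if (b + i >= dimSize)
      continue;                   // out-of-bounds: keep the previous value
    Row tmp;
    tmp.fill((float)(b + i));     // stands in for the 1-D transfer_read
    vec = insertRow(vec, tmp, i);
  }
  // Rows 0..2 were read (values 3, 4, 5); rows 3..4 stayed at the padding.
  for (const Row &r : vec) std::cout << r[0] << ' ';  // 3 4 5 -1 -1
  std::cout << '\n';
}
```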
-struct UnrollTransferReadConversion : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - /// Return the vector into which the newly created TransferReadOp results - /// are inserted. - Value getResultVector(TransferReadOp xferOp, - PatternRewriter &rewriter) const { - if (auto insertOp = getInsertOp(xferOp)) - return insertOp.dest(); - return std_splat(xferOp.getVectorType(), xferOp.padding()).value; - } - - /// If the result of the TransferReadOp has exactly one user, which is a - /// vector::InsertOp, return that operation. - vector::InsertOp getInsertOp(TransferReadOp xferOp) const { - if (xferOp->hasOneUse()) { - Operation *xferOpUser = *xferOp->getUsers().begin(); - if (auto insertOp = dyn_cast(xferOpUser)) - return insertOp; - } - - return vector::InsertOp(); - } - - /// If the result of the TransferReadOp has exactly one user, which is a - /// vector::InsertOp, return that operation's indices. - void getInsertionIndices(TransferReadOp xferOp, - SmallVector &indices) const { - if (auto insertOp = getInsertOp(xferOp)) { - llvm::for_each(insertOp.position(), [&](Attribute attr) { - indices.push_back(attr.dyn_cast().getInt()); - }); - } - } - - /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds - /// accesses, and broadcasts and transposes in permutation maps. - LogicalResult matchAndRewrite(TransferReadOp xferOp, - PatternRewriter &rewriter) const override { - if (xferOp.getVectorType().getRank() <= kTargetRank) - return failure(); - - ScopedContext scope(rewriter, xferOp.getLoc()); - auto insertOp = getInsertOp(xferOp); - auto vec = getResultVector(xferOp, rewriter); - auto vecType = vec.getType().dyn_cast(); - auto xferVecType = xferOp.getVectorType(); - auto newXferVecType = VectorType::get(xferVecType.getShape().drop_front(), - xferVecType.getElementType()); - int64_t dimSize = xferVecType.getShape()[0]; - - // Generate fully unrolled loop of transfer ops. - for (int64_t i = 0; i < dimSize; ++i) { - Value iv = std_constant_index(i); - - vec = generateInBoundsCheck( - xferOp, iv, rewriter, unpackedDim(xferOp), TypeRange(vecType), - /*inBoundsCase=*/ - [&](OpBuilder &b, Location loc) { - ScopedContext scope(b, loc); - - // Indices for the new transfer op. - SmallVector xferIndices; - getXferIndices(xferOp, iv, xferIndices); - - // Indices for the new vector.insert op. - SmallVector insertionIndices; - getInsertionIndices(xferOp, insertionIndices); - insertionIndices.push_back(i); - - auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr()); - auto newXferOpVal = - vector_transfer_read( - newXferVecType, xferOp.source(), xferIndices, - AffineMapAttr::get(unpackedPermutationMap(xferOp, b)), - xferOp.padding(), Value(), inBoundsAttr) - .value; - auto newXferOp = - dyn_cast(newXferOpVal.getDefiningOp()); - - maybeAssignMask(b, xferOp, newXferOp, i); - - return vector_insert(newXferOp, vec, insertionIndices).value; - }, - /*outOfBoundsCase=*/ - [&](OpBuilder &b, Location loc) { - // Loop through original (unmodified) vector. - return vec; - }); - } - - if (insertOp) { - // Rewrite single user of the old TransferReadOp, which was an InsertOp. - rewriter.replaceOp(insertOp, vec); - rewriter.eraseOp(xferOp); - } else { - rewriter.replaceOp(xferOp, vec); - } - - return success(); - } -}; - -/// Progressive lowering of vector TransferWriteOp with unrolling: Unpack one -/// dimension. This is similar to TransferOpConversion, but no -/// memref buffer is allocated and the SCF loop is fully unrolled. 
-/// -/// ``` -/// E.g.: -/// ``` -/// vector.transfer_write %vec, %A[%a, %b, %c] -/// : vector<5x4xf32>, memref -/// ``` -/// is rewritten to IR such as (simplified): -/// ``` -/// %v0 = vector.extract %vec[0] : vector<5x4xf32> -/// vector.transfer_write %v0, %A[%a, %b, %c] : vector<4xf32>, memref<...> -/// %v1 = vector.extract %vec[1] : vector<5x4xf32> -/// vector.transfer_write %v1, %A[%a, %b + 1, %c] : vector<4xf32>, memref<...> -/// ... -/// %v4 = vector.extract %vec[4] : vector<5x4xf32> -/// vector.transfer_write %v4, %A[%a, %b + 4, %c] : vector<4xf32>, memref<...> -/// ``` -/// -/// Note: As an optimization, if the vector of the original TransferWriteOp -/// was directly extracted from another vector via an ExtractOp `a`, extract -/// the vectors for the newly generated TransferWriteOps from `a`'s input. By -/// doing so, `a` may become dead, and the number of ExtractOps generated during -/// recursive application of this pattern will be minimal. -struct UnrollTransferWriteConversion - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - /// Return the vector from which newly generated ExtracOps will extract. - Value getDataVector(TransferWriteOp xferOp) const { - if (auto extractOp = getExtractOp(xferOp)) - return extractOp.vector(); - return xferOp.vector(); - } - - /// If the input of the given TransferWriteOp is an ExtractOp, return it. - vector::ExtractOp getExtractOp(TransferWriteOp xferOp) const { - if (auto *op = xferOp.vector().getDefiningOp()) - return dyn_cast(op); - return vector::ExtractOp(); - } - - /// If the input of the given TransferWriteOp is an ExtractOp, return its - /// indices. - void getExtractionIndices(TransferWriteOp xferOp, - SmallVector &indices) const { - if (auto extractOp = getExtractOp(xferOp)) { - llvm::for_each(extractOp.position(), [&](Attribute attr) { - indices.push_back(attr.dyn_cast().getInt()); - }); - } - } - - /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds - /// accesses, and broadcasts and transposes in permutation maps. - LogicalResult matchAndRewrite(TransferWriteOp xferOp, - PatternRewriter &rewriter) const override { - if (xferOp.getVectorType().getRank() <= kTargetRank) - return failure(); - - ScopedContext scope(rewriter, xferOp.getLoc()); - auto vec = getDataVector(xferOp); - auto xferVecType = xferOp.getVectorType(); - int64_t dimSize = xferVecType.getShape()[0]; - - // Generate fully unrolled loop of transfer ops. - for (int64_t i = 0; i < dimSize; ++i) { - Value iv = std_constant_index(i); - - generateInBoundsCheck( - xferOp, iv, rewriter, unpackedDim(xferOp), - /*inBoundsCase=*/[&](OpBuilder &b, Location loc) { - ScopedContext scope(b, loc); - - // Indices for the new transfer op. - SmallVector xferIndices; - getXferIndices(xferOp, iv, xferIndices); - - // Indices for the new vector.extract op. - SmallVector extractionIndices; - getExtractionIndices(xferOp, extractionIndices); - extractionIndices.push_back(i); - - auto extracted = vector_extract(vec, extractionIndices).value; - auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr()); - - auto newXferOp = - vector_transfer_write( - Type(), extracted, xferOp.source(), xferIndices, - AffineMapAttr::get(unpackedPermutationMap(xferOp, b)), - Value(), inBoundsAttr) - .op; - - maybeAssignMask(b, xferOp, newXferOp, i); - }); - } - - rewriter.eraseOp(xferOp); - return success(); - } -}; - -/// Compute the indices into the memref for the LoadOp/StoreOp generated as -/// part of TransferOp1dConversion. 
Return the memref dimension on which -/// the transfer is operating. A return value of None indicates a broadcast. -template -static Optional get1dMemrefIndices( - OpTy xferOp, Value iv, SmallVector &memrefIndices) { - auto indices = xferOp.indices(); - auto map = xferOp.permutation_map(); - - memrefIndices.append(indices.begin(), indices.end()); - assert(map.getNumResults() == 1 && - "Expected 1 permutation map result for 1D transfer"); - if (auto expr = map.getResult(0).template dyn_cast()) { - auto dim = expr.getPosition(); - using edsc::op::operator+; - memrefIndices[dim] = memrefIndices[dim] + iv; - return dim; - } - - assert(xferOp.isBroadcastDim(0) && - "Expected AffineDimExpr or AffineConstantExpr"); - return None; -} - -/// Codegen strategy for TransferOp1dConversion, depending on the -/// operation. -template -struct Strategy1d; - -/// Codegen strategy for TransferReadOp. -template <> -struct Strategy1d { - static void generateForLoopBody( - OpBuilder &builder, Location loc, TransferReadOp xferOp, Value iv, - ValueRange loopState) { - SmallVector indices; - auto dim = get1dMemrefIndices(xferOp, iv, indices); - auto ivI32 = std_index_cast( - IntegerType::get(builder.getContext(), 32), iv); - auto vec = loopState[0]; - - // In case of out-of-bounds access, leave `vec` as is (was initialized with - // padding value). - auto nextVec = generateInBoundsCheck( - xferOp, iv, builder, dim, TypeRange(xferOp.getVectorType()), - /*inBoundsCase=*/[&](OpBuilder& /*b*/, Location loc) { - auto val = memref_load(xferOp.source(), indices); - return vector_insert_element(val, vec, ivI32.value).value; - }, /*outOfBoundsCase=*/[&](OpBuilder& /*b*/, Location loc) { - return vec; - }); - builder.create(loc, nextVec); - } - - static Value initialLoopState(TransferReadOp xferOp) { - // Inititalize vector with padding value. - return std_splat(xferOp.getVectorType(), xferOp.padding()).value; - } -}; - -/// Codegen strategy for TransferWriteOp. -template <> -struct Strategy1d { - static void generateForLoopBody( - OpBuilder &builder, Location loc, TransferWriteOp xferOp, Value iv, - ValueRange /*loopState*/) { - SmallVector indices; - auto dim = get1dMemrefIndices(xferOp, iv, indices); - auto ivI32 = std_index_cast( - IntegerType::get(builder.getContext(), 32), iv); - - // Nothing to do in case of out-of-bounds access. - generateInBoundsCheck( - xferOp, iv, builder, dim, - /*inBoundsCase=*/[&](OpBuilder& /*b*/, Location loc) { - auto val = vector_extract_element(xferOp.vector(), ivI32.value); - memref_store(val, xferOp.source(), indices); - }); - builder.create(loc); - } - - static Value initialLoopState(TransferWriteOp xferOp) { - return Value(); - } -}; - -/// Return true if the last dimension of the MemRefType has unit stride. -static bool isLastMemrefDimUnitStride(MemRefType type) { - int64_t offset; - SmallVector strides; - auto successStrides = getStridesAndOffset(type, strides, offset); - return succeeded(successStrides) && strides.back() == 1; -} - -/// Lower a 1D vector transfer op to SCF using scalar loads/stores. This is -/// necessary in cases where a 1D vector transfer op cannot be lowered into -/// vector load/stores due to non-unit strides or broadcasts: -/// -/// * Transfer dimension is not the last memref dimension -/// * Transfer dimension is a broadcast (i.e., scalar load + broadcast) -/// * Memref has a layout map with non-unit stride on the last dimension -/// -/// This pattern generates IR as follows: -/// -/// 1. Generate a for loop iterating over each vector element. -/// 2. 
Inside the loop, generate a InsertElementOp or ExtractElementOp, -/// depending on OpTy. -/// -/// TODO: In some cases (no masking, etc.), LLVM::MatrixColumnMajorLoadOp -/// can be generated instead of TransferOp1dConversion. Add such a pattern -/// to ConvertVectorToLLVM. -/// -/// E.g.: -/// ``` -/// vector.transfer_write %vec, %A[%a, %b] -/// {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]} -/// : vector<9xf32>, memref -/// ``` -/// Is rewritten to approximately the following pseudo-IR: -/// ``` -/// for i = 0 to 9 { -/// %t = vector.extractelement %vec[i] : vector<9xf32> -/// memref.store %t, %arg0[%a + i, %b] : memref -/// } -/// ``` -template -struct TransferOp1dConversion : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(OpTy xferOp, - PatternRewriter &rewriter) const override { - ScopedContext scope(rewriter, xferOp.getLoc()); - auto map = xferOp.permutation_map(); - auto memRefType = xferOp.getShapedType().template dyn_cast(); - - if (!memRefType) - return failure(); - if (xferOp.getVectorType().getRank() != 1) - return failure(); - if (map.isMinorIdentity() && isLastMemrefDimUnitStride(memRefType)) - return failure(); // Handled by ConvertVectorToLLVM - - // Loop bounds, step, state... - auto vecType = xferOp.getVectorType(); - auto lb = std_constant_index(0); - auto ub = std_constant_index(vecType.getDimSize(0)); - auto step = std_constant_index(1); - auto loopState = Strategy1d::initialLoopState(xferOp); - - // Generate for loop. - rewriter.replaceOpWithNewOp( - xferOp, lb, ub, step, loopState ? ValueRange(loopState) : ValueRange(), - [&](OpBuilder &builder, Location loc, Value iv, ValueRange loopState) { - ScopedContext nestedScope(builder, loc); - Strategy1d::generateForLoopBody( - builder, loc, xferOp, iv, loopState); - }); - - return success(); - } -}; - -} // namespace - -namespace mlir { - -void populateProgressiveVectorToSCFConversionPatterns( - RewritePatternSet &patterns, - const ProgressiveVectorTransferToSCFOptions &options) { - if (options.unroll) { - patterns.add( - patterns.getContext()); - } else { - patterns.add, - TransferOpConversion>(patterns.getContext()); - } - - if (kTargetRank == 1) { - patterns.add, - TransferOp1dConversion>( - patterns.getContext()); - } -} - -struct ConvertProgressiveVectorToSCFPass - : public ConvertVectorToSCFBase { - ConvertProgressiveVectorToSCFPass( - const ProgressiveVectorTransferToSCFOptions &opt) - : options(opt) {} - - void runOnFunction() override { - RewritePatternSet patterns(getFunction().getContext()); - populateProgressiveVectorToSCFConversionPatterns(patterns, options); - (void)applyPatternsAndFoldGreedily(getFunction(), std::move(patterns)); - } - - ProgressiveVectorTransferToSCFOptions options; -}; - -} // namespace mlir - -std::unique_ptr mlir::createProgressiveConvertVectorToSCFPass( - const ProgressiveVectorTransferToSCFOptions &options) { - return std::make_unique(options); -} diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp --- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp +++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp @@ -1,4 +1,4 @@ -//===- VectorToSCF.cpp - Conversion from Vector to mix of SCF and Std -----===// +//===- VectorToSCF.cpp - Convert vector to SCF dialect ----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements target-dependent lowering of vector transfer operations. +// This file implements lowering of vector transfer operations to SCF. // //===----------------------------------------------------------------------===// @@ -17,16 +17,12 @@ #include "../PassDetail.h" #include "mlir/Dialect/Affine/EDSC/Intrinsics.h" #include "mlir/Dialect/MemRef/EDSC/Intrinsics.h" -#include "mlir/Dialect/SCF/EDSC/Builders.h" #include "mlir/Dialect/SCF/EDSC/Intrinsics.h" #include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h" #include "mlir/Dialect/Vector/EDSC/Intrinsics.h" #include "mlir/Dialect/Vector/VectorOps.h" #include "mlir/Dialect/Vector/VectorUtils.h" -#include "mlir/IR/AffineExpr.h" -#include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" -#include "mlir/IR/Matchers.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" @@ -37,672 +33,1091 @@ using vector::TransferReadOp; using vector::TransferWriteOp; -// Return a list of Values that correspond to multiple AffineApplyOp, one for -// each result of `map`. Each `expr` in `map` is canonicalized and folded -// greedily according to its operands. -// TODO: factor out in a common location that both linalg and vector can use. -static SmallVector -applyMapToValues(OpBuilder &b, Location loc, AffineMap map, ValueRange values) { - SmallVector res; - res.reserve(map.getNumResults()); - unsigned numDims = map.getNumDims(), numSym = map.getNumSymbols(); - // For each `expr` in `map`, applies the `expr` to the values extracted from - // ranges. If the resulting application can be folded into a Value, the - // folding occurs eagerly. Otherwise, an affine.apply operation is emitted. - for (auto expr : map.getResults()) { - AffineMap map = AffineMap::get(numDims, numSym, expr); - SmallVector operands(values.begin(), values.end()); - fullyComposeAffineMapAndOperands(&map, &operands); - canonicalizeMapAndOperands(&map, &operands); - res.push_back(b.createOrFold(loc, map, operands)); - } - return res; +namespace { + +/// Attribute name used for labeling transfer ops during progressive lowering. +static const char kPassLabel[] = "__vector_to_scf_lowering__"; + +/// Lower to 1D transfer ops. Target-specific lowering will lower those. +static const int64_t kTargetRank = 1; + +/// Given a MemRefType with VectorType element type, unpack one dimension from +/// the VectorType into the MemRefType. +/// +/// E.g.: memref<9xvector<5x6xf32>> --> memref<9x5xvector<6xf32>> +static MemRefType unpackOneDim(MemRefType type) { + auto vectorType = type.getElementType().dyn_cast(); + auto memrefShape = type.getShape(); + SmallVector newMemrefShape; + newMemrefShape.append(memrefShape.begin(), memrefShape.end()); + newMemrefShape.push_back(vectorType.getDimSize(0)); + return MemRefType::get(newMemrefShape, + VectorType::get(vectorType.getShape().drop_front(), + vectorType.getElementType())); } -namespace { -/// Helper class captures the common information needed to lower N>1-D vector -/// transfer operations (read and write). -/// On construction, this class opens an edsc::ScopedContext for simpler IR -/// manipulation. -/// In pseudo-IR, for an n-D vector_transfer_read such as: +/// Helper data structure for data and mask buffers. +struct BufferAllocs { + Value dataBuffer; + Value maskBuffer; +}; + +/// Allocate temporary buffers for data (vector) and mask (if present). 
+/// TODO: Parallelism and threadlocal considerations. +template +static BufferAllocs allocBuffers(OpTy xferOp) { + auto &b = ScopedContext::getBuilderRef(); + OpBuilder::InsertionGuard guard(b); + Operation *scope = + xferOp->template getParentWithTrait(); + assert(scope && "Expected op to be inside automatic allocation scope"); + b.setInsertionPointToStart(&scope->getRegion(0).front()); + + BufferAllocs result; + auto bufferType = MemRefType::get({}, xferOp.getVectorType()); + result.dataBuffer = memref_alloca(bufferType).value; + + if (xferOp.mask()) { + auto maskType = MemRefType::get({}, xferOp.mask().getType()); + Value maskBuffer = memref_alloca(maskType); + memref_store(xferOp.mask(), maskBuffer); + result.maskBuffer = memref_load(maskBuffer); + } + + return result; +} + +/// Given a vector transfer op, calculate which dimension of the `source` +/// memref should be unpacked in the next application of TransferOpConversion. +/// A return value of None indicates a broadcast. +template +static Optional unpackedDim(OpTy xferOp) { + auto map = xferOp.permutation_map(); + if (auto expr = map.getResult(0).template dyn_cast()) { + return expr.getPosition(); + } + assert(xferOp.isBroadcastDim(0) && + "Expected AffineDimExpr or AffineConstantExpr"); + return None; +} + +/// Compute the permutation map for the new (N-1)-D vector transfer op. This +/// map is identical to the current permutation map, but the first result is +/// omitted. +template +static AffineMap unpackedPermutationMap(OpTy xferOp, OpBuilder &builder) { + auto map = xferOp.permutation_map(); + return AffineMap::get(map.getNumDims(), 0, map.getResults().drop_front(), + builder.getContext()); +} + +/// Calculate the indices for the new vector transfer op. /// +/// E.g.: transfer_read %A[%a, %b, %c, %d] ... : vector<5x4x3xf32> ... +/// --> transfer_read %A[%a, %b + iv, %c, %d] ... vector<4x3f32> +/// ^^^^^^ +/// `iv` is the iteration variable of the (new) surrounding loop. +template +static void getXferIndices(OpTy xferOp, Value iv, + SmallVector &indices) { + typename OpTy::Adaptor adaptor(xferOp); + // Corresponding memref dim of the vector dim that is unpacked. + auto dim = unpackedDim(xferOp); + auto prevIndices = adaptor.indices(); + indices.append(prevIndices.begin(), prevIndices.end()); + + bool isBroadcast = !dim.hasValue(); + if (!isBroadcast) { + using edsc::op::operator+; + indices[dim.getValue()] = adaptor.indices()[dim.getValue()] + iv; + } +} + +static void maybeYieldValue(bool hasRetVal, OpBuilder builder, Location loc, + Value value) { + if (hasRetVal) { + builder.create(loc, value); + } else { + builder.create(loc); + } +} + +/// Generates a boolean Value that is true if the iv-th bit in xferOp's mask +/// is set to true. No such check is generated under following circumstances: +/// * xferOp does not have a mask. +/// * xferOp's mask is not 1D. (In case of (N>1)-D, a subvector of the mask is +/// computed and attached to the new transfer op in the pattern.) +/// * The to-be-unpacked dim of xferOp is a broadcast. +template +static Value generateMaskCheck(OpBuilder &builder, OpTy xferOp, Value iv) { + if (!xferOp.mask()) + return Value(); + if (xferOp.getMaskType().getRank() != 1) + return Value(); + if (xferOp.isBroadcastDim(0)) + return Value(); + + auto ivI32 = std_index_cast(IntegerType::get(builder.getContext(), 32), iv); + return vector_extract_element(xferOp.mask(), ivI32).value; +} + +/// Helper function TransferOpConversion and TransferOp1dConversion. 
+/// Generate an in-bounds check if the transfer op may go out-of-bounds on the +/// specified dimension `dim` with the loop iteration variable `iv`. +/// E.g., when unpacking dimension 0 from: /// ``` -/// vector_transfer_read(%m, %offsets, identity_map, %fill) : -/// memref<(leading_dims) x (major_dims) x (minor_dims) x type>, -/// vector<(major_dims) x (minor_dims) x type> +/// %vec = vector.transfer_read %A[%a, %b] %cst +/// : vector<5x4xf32>, memref /// ``` -/// -/// where rank(minor_dims) is the lower-level vector rank (e.g. 1 for LLVM or -/// higher). -/// -/// This is the entry point to emitting pseudo-IR resembling: -/// +/// An if check similar to this will be generated inside the loop: /// ``` -/// %tmp = alloc(): memref<(major_dims) x vector> -/// for (%ivs_major, {0}, {vector_shape}, {1}) { // (N-1)-D loop nest -/// if (any_of(%ivs_major + %offsets, <, major_dims)) { -/// %v = vector_transfer_read( -/// {%offsets_leading, %ivs_major + %offsets_major, %offsets_minor}, -/// %ivs_minor): -/// memref<(leading_dims) x (major_dims) x (minor_dims) x type>, -/// vector<(minor_dims) x type>; -/// store(%v, %tmp); -/// } else { -/// %v = splat(vector<(minor_dims) x type>, %fill) -/// store(%v, %tmp, %ivs_major); -/// } -/// } -/// %res = load(%tmp, %0): memref<(major_dims) x vector>): -// vector<(major_dims) x (minor_dims) x type> +/// %d = memref.dim %A, %c0 : memref +/// if (%a + iv < %d) { +/// (in-bounds case) +/// } else { +/// (out-of-bounds case) +/// } /// ``` /// -template -class NDTransferOpHelper { -public: - NDTransferOpHelper(PatternRewriter &rewriter, ConcreteOp xferOp, - const VectorTransferToSCFOptions &options) - : rewriter(rewriter), options(options), loc(xferOp.getLoc()), - scope(std::make_unique(rewriter, loc)), xferOp(xferOp), - op(xferOp.getOperation()) { - vectorType = xferOp.getVectorType(); - // TODO: when we go to k > 1-D vectors adapt minorRank. - minorRank = 1; - majorRank = vectorType.getRank() - minorRank; - leadingRank = xferOp.getLeadingShapedRank(); - majorVectorType = - VectorType::get(vectorType.getShape().take_front(majorRank), - vectorType.getElementType()); - minorVectorType = - VectorType::get(vectorType.getShape().take_back(minorRank), - vectorType.getElementType()); - /// Memref of minor vector type is used for individual transfers. - memRefMinorVectorType = MemRefType::get( - majorVectorType.getShape(), minorVectorType, {}, - xferOp.getShapedType().template cast().getMemorySpace()); - } - - LogicalResult doReplace(); - -private: - /// Creates the loop nest on the "major" dimensions and calls the - /// `loopBodyBuilder` lambda in the context of the loop nest. - void - emitLoops(llvm::function_ref - loopBodyBuilder); - - /// Common state to lower vector transfer ops. 
- PatternRewriter &rewriter; - const VectorTransferToSCFOptions &options; - Location loc; - std::unique_ptr scope; - ConcreteOp xferOp; - Operation *op; - // A vector transfer copies data between: - // - memref<(leading_dims) x (major_dims) x (minor_dims) x type> - // - vector<(major_dims) x (minor_dims) x type> - unsigned minorRank; // for now always 1 - unsigned majorRank; // vector rank - minorRank - unsigned leadingRank; // memref rank - vector rank - VectorType vectorType; // vector<(major_dims) x (minor_dims) x type> - VectorType majorVectorType; // vector<(major_dims) x type> - VectorType minorVectorType; // vector<(minor_dims) x type> - MemRefType memRefMinorVectorType; // memref> -}; +/// If the transfer is 1D and has a mask, this function generates a more complex +/// check also accounts for potentially masked out elements. +/// +/// This function variant returns the value returned by `inBoundsCase` or +/// `outOfBoundsCase`. The MLIR type of the return value must be specified in +/// `resultTypes`. +template +static Value generateInBoundsCheck( + OpTy xferOp, Value iv, OpBuilder &builder, Optional dim, + TypeRange resultTypes, + function_ref inBoundsCase, + function_ref outOfBoundsCase = nullptr) { + bool hasRetVal = !resultTypes.empty(); + Value cond; // Condition to be built... -template -void NDTransferOpHelper::emitLoops( - llvm::function_ref - loopBodyBuilder) { - /// Loop nest operates on the major dimensions - MemRefBoundsCapture memrefBoundsCapture(xferOp.source()); + // Condition check 1: Access in-bounds? + bool isBroadcast = !dim.hasValue(); // No in-bounds check for broadcasts. + if (!xferOp.isDimInBounds(0) && !isBroadcast) { + auto memrefDim = + memref_dim(xferOp.source(), std_constant_index(dim.getValue())); + using edsc::op::operator+; + auto memrefIdx = xferOp.indices()[dim.getValue()] + iv; + cond = std_cmpi_sgt(memrefDim.value, memrefIdx); + } - if (options.unroll) { - auto shape = majorVectorType.getShape(); - auto strides = computeStrides(shape); - unsigned numUnrolledInstances = computeMaxLinearIndex(shape); - ValueRange indices(xferOp.indices()); - for (unsigned idx = 0; idx < numUnrolledInstances; ++idx) { - SmallVector offsets = delinearize(strides, idx); - SmallVector offsetValues = - llvm::to_vector<4>(llvm::map_range(offsets, [](int64_t off) -> Value { - return std_constant_index(off); - })); - loopBodyBuilder(offsetValues, indices.take_front(leadingRank), - indices.drop_front(leadingRank).take_front(majorRank), - indices.take_back(minorRank), memrefBoundsCapture); + // Condition check 2: Masked in? + if (auto maskCond = generateMaskCheck(builder, xferOp, iv)) { + if (cond) { + cond = builder.create(xferOp.getLoc(), cond, maskCond); + } else { + cond = maskCond; } - } else { - VectorBoundsCapture vectorBoundsCapture(majorVectorType); - auto majorLbs = vectorBoundsCapture.getLbs(); - auto majorUbs = vectorBoundsCapture.getUbs(); - auto majorSteps = vectorBoundsCapture.getSteps(); - affineLoopNestBuilder( - majorLbs, majorUbs, majorSteps, [&](ValueRange majorIvs) { - ValueRange indices(xferOp.indices()); - loopBodyBuilder(majorIvs, indices.take_front(leadingRank), - indices.drop_front(leadingRank).take_front(majorRank), - indices.take_back(minorRank), memrefBoundsCapture); + } + + // If the condition is non-empty, generate an SCF::IfOp. 
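+  // As an illustrative sketch (the vector type below is made up), when
+  // `resultTypes` is non-empty the generated IR resembles:
+  //   %r = scf.if %cond -> (vector<4xf32>) {
+  //     %v = ...inBoundsCase...
+  //     scf.yield %v : vector<4xf32>
+  //   } else {
+  //     %w = ...outOfBoundsCase...
+  //     scf.yield %w : vector<4xf32>
+  //   }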
+ if (cond) { + auto check = builder.create( + xferOp.getLoc(), resultTypes, cond, + /*thenBuilder=*/ + [&](OpBuilder &builder, Location loc) { + maybeYieldValue(hasRetVal, builder, loc, inBoundsCase(builder, loc)); + }, + /*elseBuilder=*/ + [&](OpBuilder &builder, Location loc) { + if (outOfBoundsCase) { + maybeYieldValue(hasRetVal, builder, loc, + outOfBoundsCase(builder, loc)); + } else { + builder.create(loc); + } }); + + return hasRetVal ? check.getResult(0) : Value(); } + + // Condition is empty, no need for an SCF::IfOp. + return inBoundsCase(builder, xferOp.getLoc()); } -static Optional extractConstantIndex(Value v) { - if (auto cstOp = v.getDefiningOp()) - return cstOp.getValue(); - if (auto affineApplyOp = v.getDefiningOp()) - if (affineApplyOp.getAffineMap().isSingleConstant()) - return affineApplyOp.getAffineMap().getSingleConstantResult(); - return None; +/// In this function variant, `inBoundsCase` and `outOfBoundsCase` do not have +/// a return value. Consequently, this function does not have a return value. +template +static void generateInBoundsCheck( + OpTy xferOp, Value iv, OpBuilder &builder, Optional dim, + function_ref inBoundsCase, + function_ref outOfBoundsCase = nullptr) { + generateInBoundsCheck( + xferOp, iv, builder, dim, /*resultTypes=*/TypeRange(), + /*inBoundsCase=*/ + [&](OpBuilder &builder, Location loc) { + inBoundsCase(builder, loc); + return Value(); + }, + /*outOfBoundsCase=*/ + [&](OpBuilder &builder, Location loc) { + if (outOfBoundsCase) + outOfBoundsCase(builder, loc); + return Value(); + }); } -// Missing foldings of scf.if make it necessary to perform poor man's folding -// eagerly, especially in the case of unrolling. In the future, this should go -// away once scf.if folds properly. -static Value onTheFlyFoldSLT(Value v, Value ub) { - using namespace mlir::edsc::op; - auto maybeCstV = extractConstantIndex(v); - auto maybeCstUb = extractConstantIndex(ub); - if (maybeCstV && maybeCstUb && *maybeCstV < *maybeCstUb) - return Value(); - return slt(v, ub); +/// Given an ArrayAttr, return a copy where the first element is dropped. +static ArrayAttr dropFirstElem(OpBuilder &builder, ArrayAttr attr) { + if (!attr) + return attr; + return ArrayAttr::get(builder.getContext(), attr.getValue().drop_front()); } -/// 1. Compute the indexings `majorIvs + majorOffsets` and save them in -/// `majorIvsPlusOffsets`. -/// 2. Return a value of i1 that determines whether the first -/// `majorIvs.rank()` -/// dimensions `majorIvs + majorOffsets` are all within `memrefBounds`. 
-static Value -emitInBoundsCondition(PatternRewriter &rewriter, - VectorTransferOpInterface xferOp, unsigned leadingRank, - ValueRange majorIvs, ValueRange majorOffsets, - const MemRefBoundsCapture &memrefBounds, - SmallVectorImpl &majorIvsPlusOffsets) { - Value inBoundsCondition; - majorIvsPlusOffsets.reserve(majorIvs.size()); - unsigned idx = 0; - SmallVector bounds = - applyMapToValues(rewriter, xferOp.getLoc(), xferOp.permutation_map(), - memrefBounds.getUbs()); - for (auto it : llvm::zip(majorIvs, majorOffsets, bounds)) { - Value iv = std::get<0>(it), off = std::get<1>(it), ub = std::get<2>(it); - using namespace mlir::edsc::op; - majorIvsPlusOffsets.push_back(iv + off); - auto affineConstExpr = - xferOp.permutation_map().getResult(idx).dyn_cast(); - bool isBroadcast = affineConstExpr && affineConstExpr.getValue() == 0; - if (!xferOp.isDimInBounds(leadingRank + idx) && !isBroadcast) { - Value inBoundsCond = onTheFlyFoldSLT(majorIvsPlusOffsets.back(), ub); - if (inBoundsCond) - inBoundsCondition = (inBoundsCondition) - ? (inBoundsCondition && inBoundsCond) - : inBoundsCond; - } - ++idx; - } - return inBoundsCondition; +/// Add the pass label to a vector transfer op if its rank is not the target +/// rank. +template +static void maybeApplyPassLabel(OpBuilder &builder, OpTy newXferOp) { + if (newXferOp.getVectorType().getRank() > kTargetRank) + newXferOp->setAttr(kPassLabel, builder.getUnitAttr()); } -// TODO: Parallelism and threadlocal considerations. -static Value setAllocAtFunctionEntry(MemRefType memRefMinorVectorType, - Operation *op) { - auto &b = ScopedContext::getBuilderRef(); - OpBuilder::InsertionGuard guard(b); - Operation *scope = - op->getParentWithTrait(); - assert(scope && "Expected op to be inside automatic allocation scope"); - b.setInsertionPointToStart(&scope->getRegion(0).front()); - Value res = memref_alloca(memRefMinorVectorType); - return res; +/// Given a transfer op, find the memref from which the mask is loaded. This +/// is similar to Strategy::getBuffer. +template +static Value getMaskBuffer(OpTy xferOp) { + assert(xferOp.mask() && "Expected that transfer op has mask"); + auto loadOp = xferOp.mask().template getDefiningOp(); + assert(loadOp && "Expected transfer op mask produced by LoadOp"); + return loadOp.getMemRef(); } +/// Codegen strategy, depending on the operation. +template +struct Strategy; + +/// Code strategy for vector TransferReadOp. template <> -LogicalResult NDTransferOpHelper::doReplace() { - Value alloc, result; - if (options.unroll) - result = std_splat(vectorType, xferOp.padding()); - else - alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op); - - emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets, - ValueRange majorOffsets, ValueRange minorOffsets, - const MemRefBoundsCapture &memrefBounds) { - /// Lambda to load 1-D vector in the current loop ivs + offset context. 
- auto load1DVector = [&](ValueRange majorIvsPlusOffsets) -> Value { - SmallVector indexing; - indexing.reserve(leadingRank + majorRank + minorRank); - indexing.append(leadingOffsets.begin(), leadingOffsets.end()); - indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end()); - indexing.append(minorOffsets.begin(), minorOffsets.end()); - Value memref = xferOp.source(); - auto map = - getTransferMinorIdentityMap(xferOp.getShapedType(), minorVectorType); - ArrayAttr inBounds; - if (xferOp.isDimInBounds(xferOp.getVectorType().getRank() - 1)) { - OpBuilder &b = ScopedContext::getBuilderRef(); - inBounds = b.getBoolArrayAttr({true}); - } - return vector_transfer_read(minorVectorType, memref, indexing, - AffineMapAttr::get(map), xferOp.padding(), - inBounds); - }; - - // 1. Compute the inBoundsCondition in the current loops ivs + offset - // context. - SmallVector majorIvsPlusOffsets; - Value inBoundsCondition = emitInBoundsCondition( - rewriter, cast(xferOp.getOperation()), - leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets); - - if (inBoundsCondition) { - // 2. If the condition is not null, we need an IfOp, which may yield - // if `options.unroll` is true. - SmallVector resultType; - if (options.unroll) - resultType.push_back(vectorType); - - // 3. If in-bounds, progressively lower to a 1-D transfer read, otherwise - // splat a 1-D vector. - ValueRange ifResults = conditionBuilder( - resultType, inBoundsCondition, - [&]() -> scf::ValueVector { - Value vector = load1DVector(majorIvsPlusOffsets); - // 3.a. If `options.unroll` is true, insert the 1-D vector in the - // aggregate. We must yield and merge with the `else` branch. - if (options.unroll) { - vector = vector_insert(vector, result, majorIvs); - return {vector}; - } - // 3.b. Otherwise, just go through the temporary `alloc`. - memref_store(vector, alloc, majorIvs); - return {}; - }, - [&]() -> scf::ValueVector { - Value vector = std_splat(minorVectorType, xferOp.padding()); - // 3.c. If `options.unroll` is true, insert the 1-D vector in the - // aggregate. We must yield and merge with the `then` branch. - if (options.unroll) { - vector = vector_insert(vector, result, majorIvs); - return {vector}; - } - // 3.d. Otherwise, just go through the temporary `alloc`. - memref_store(vector, alloc, majorIvs); - return {}; - }); +struct Strategy { + /// Find the StoreOp that is used for writing the current TransferReadOp's + /// result to the temporary buffer allocation. + static memref::StoreOp getStoreOp(TransferReadOp xferOp) { + assert(xferOp->hasOneUse() && "Expected exactly one use of TransferReadOp"); + auto storeOp = dyn_cast((*xferOp->use_begin()).getOwner()); + assert(storeOp && "Expected TransferReadOp result used by StoreOp"); + return storeOp; + } - if (!resultType.empty()) - result = *ifResults.begin(); - } else { - // 4. Guaranteed in-bounds, progressively lower to a 1-D transfer read. - Value loaded1D = load1DVector(majorIvsPlusOffsets); - // 5.a. If `options.unroll` is true, insert the 1-D vector in the - // aggregate. - if (options.unroll) - result = vector_insert(loaded1D, result, majorIvs); - // 5.b. Otherwise, just go through the temporary `alloc`. - else - memref_store(loaded1D, alloc, majorIvs); - } - }); + /// Find the temporary buffer allocation. All labeled TransferReadOps are + /// used like this, where %buf is either the buffer allocation or a type cast + /// of the buffer allocation: + /// ``` + /// %vec = vector.transfer_read ... { __vector_to_scf_lowering__ } ... 
+ /// memref.store %vec, %buf[...] ... + /// ``` + static Value getBuffer(TransferReadOp xferOp) { + return getStoreOp(xferOp).getMemRef(); + } - assert((!options.unroll ^ (bool)result) && - "Expected resulting Value iff unroll"); - if (!result) - result = - memref_load(vector_type_cast(MemRefType::get({}, vectorType), alloc)); - rewriter.replaceOp(op, result); + /// Retrieve the indices of the current StoreOp that stores into the buffer. + static void getBufferIndices(TransferReadOp xferOp, + SmallVector &indices) { + auto storeOp = getStoreOp(xferOp); + auto prevIndices = memref::StoreOpAdaptor(storeOp).indices(); + indices.append(prevIndices.begin(), prevIndices.end()); + } - return success(); -} + /// Rewrite the TransferReadOp, assuming that there are no out-of-bounds + /// accesses on the to-be-unpacked dimension. + /// + /// 1. Generate a new (N-1)-d TransferReadOp using the loop iteration + /// variable `iv`. + /// 2. Store the result into the (already `vector.type_cast`ed) buffer. + /// + /// E.g.: + /// ``` + /// %vec = vector.transfer_read %A[%a+%i, %b, %c], %cst + /// : memref, vector<4x3xf32> + /// memref.store %vec, %buf[%i] : memref<5xvector<4x3xf32>> + /// ``` + /// Is rewritten to: + /// ``` + /// %casted = vector.type_cast %buf + /// : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>> + /// for %j = 0 to 4 { + /// %vec = vector.transfer_read %A[%a+%i, %b+%j, %c], %cst + /// : memref, vector<3xf32> + /// memref.store %vec, %casted[%i, %j] : memref<5x4xvector<3xf32>> + /// } + /// ``` + /// + /// Note: The loop and type cast are generated in TransferOpConversion. + /// The original TransferReadOp and store op are deleted in `cleanup`. + /// Note: The `mask` operand is set in TransferOpConversion. + static TransferReadOp rewriteOp(OpBuilder &builder, TransferReadOp xferOp, + Value buffer, Value iv) { + SmallVector storeIndices; + getBufferIndices(xferOp, storeIndices); + storeIndices.push_back(iv); + + SmallVector xferIndices; + getXferIndices(xferOp, iv, xferIndices); + + auto bufferType = buffer.getType().dyn_cast(); + auto vecType = bufferType.getElementType().dyn_cast(); + auto inBoundsAttr = dropFirstElem(builder, xferOp.in_boundsAttr()); + auto newXfer = + vector_transfer_read( + vecType, xferOp.source(), xferIndices, + AffineMapAttr::get(unpackedPermutationMap(xferOp, builder)), + xferOp.padding(), Value(), inBoundsAttr) + .value; + + maybeApplyPassLabel(builder, + dyn_cast(newXfer.getDefiningOp())); + + memref_store(newXfer, buffer, storeIndices); + return newXfer.getDefiningOp(); + } + + /// Handle out-of-bounds accesses on the to-be-unpacked dimension: Write + /// padding value to the temporary buffer. + static void handleOutOfBoundsDim(OpBuilder & /*builder*/, + TransferReadOp xferOp, Value buffer, + Value iv) { + SmallVector storeIndices; + getBufferIndices(xferOp, storeIndices); + storeIndices.push_back(iv); + + auto bufferType = buffer.getType().dyn_cast(); + auto vecType = bufferType.getElementType().dyn_cast(); + auto vec = std_splat(vecType, xferOp.padding()); + memref_store(vec, buffer, storeIndices); + } + + /// Cleanup after rewriting the op. + static void cleanup(PatternRewriter &rewriter, TransferReadOp xferOp) { + rewriter.eraseOp(getStoreOp(xferOp)); + rewriter.eraseOp(xferOp); + } +}; +/// Codegen strategy for vector TransferWriteOp. 
template <> -LogicalResult NDTransferOpHelper::doReplace() { - Value alloc; - if (!options.unroll) { - alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op); - memref_store(xferOp.vector(), - vector_type_cast(MemRefType::get({}, vectorType), alloc)); - } - - emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets, - ValueRange majorOffsets, ValueRange minorOffsets, - const MemRefBoundsCapture &memrefBounds) { - // Lower to 1-D vector_transfer_write and let recursion handle it. - auto emitTransferWrite = [&](ValueRange majorIvsPlusOffsets) { - SmallVector indexing; - indexing.reserve(leadingRank + majorRank + minorRank); - indexing.append(leadingOffsets.begin(), leadingOffsets.end()); - indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end()); - indexing.append(minorOffsets.begin(), minorOffsets.end()); - Value result; - // If `options.unroll` is true, extract the 1-D vector from the - // aggregate. - if (options.unroll) - result = vector_extract(xferOp.vector(), majorIvs); - else - result = memref_load(alloc, majorIvs); - auto map = - getTransferMinorIdentityMap(xferOp.getShapedType(), minorVectorType); - ArrayAttr inBounds; - if (xferOp.isDimInBounds(xferOp.getVectorType().getRank() - 1)) { - OpBuilder &b = ScopedContext::getBuilderRef(); - inBounds = b.getBoolArrayAttr({true}); - } - vector_transfer_write(result, xferOp.source(), indexing, - AffineMapAttr::get(map), inBounds); - }; - - // 1. Compute the inBoundsCondition in the current loops ivs + offset - // context. - SmallVector majorIvsPlusOffsets; - Value inBoundsCondition = emitInBoundsCondition( - rewriter, cast(xferOp.getOperation()), - leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets); - - if (inBoundsCondition) { - // 2.a. If the condition is not null, we need an IfOp, to write - // conditionally. Progressively lower to a 1-D transfer write. - conditionBuilder(inBoundsCondition, - [&] { emitTransferWrite(majorIvsPlusOffsets); }); - } else { - // 2.b. Guaranteed in-bounds. Progressively lower to a 1-D transfer write. - emitTransferWrite(majorIvsPlusOffsets); - } - }); +struct Strategy { + /// Find the temporary buffer allocation. All labeled TransferWriteOps are + /// used like this, where %buf is either the buffer allocation or a type cast + /// of the buffer allocation: + /// ``` + /// %vec = memref.load %buf[...] ... + /// vector.transfer_write %vec ... { __vector_to_scf_lowering__ } ... + /// ``` + static Value getBuffer(TransferWriteOp xferOp) { + auto loadOp = xferOp.vector().getDefiningOp(); + assert(loadOp && "Expected transfer op vector produced by LoadOp"); + return loadOp.getMemRef(); + } + + /// Retrieve the indices of the current LoadOp that loads from the buffer. + static void getBufferIndices(TransferWriteOp xferOp, + SmallVector &indices) { + auto loadOp = xferOp.vector().getDefiningOp(); + auto prevIndices = memref::LoadOpAdaptor(loadOp).indices(); + indices.append(prevIndices.begin(), prevIndices.end()); + } + + /// Rewrite the TransferWriteOp, assuming that there are no out-of-bounds + /// accesses on the to-be-unpacked dimension. + /// + /// 1. Load an (N-1)-d vector from the (already `vector.type_cast`ed) buffer, + /// using the loop iteration variable `iv`. + /// 2. Generate a new (N-1)-d TransferWriteOp, writing the loaded vector back + /// to memory. + /// + /// Note: For more details, see comments on Strategy. 
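+  /// A sketch of the rewrite, mirroring the TransferReadOp case (shapes and
+  /// value names are illustrative):
+  /// ```
+  /// %vec = memref.load %casted[%i, %j] : memref<5x4xvector<3xf32>>
+  /// vector.transfer_write %vec, %A[%a + %i, %b + %j, %c]
+  ///     : vector<3xf32>, memref<?x?x?xf32>
+  /// ```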
+ static TransferWriteOp rewriteOp(OpBuilder &builder, TransferWriteOp xferOp, + Value buffer, Value iv) { + SmallVector loadIndices; + getBufferIndices(xferOp, loadIndices); + loadIndices.push_back(iv); + + SmallVector xferIndices; + getXferIndices(xferOp, iv, xferIndices); - rewriter.eraseOp(op); + auto vec = memref_load(buffer, loadIndices); + auto inBoundsAttr = dropFirstElem(builder, xferOp.in_boundsAttr()); + auto newXfer = vector_transfer_write( + Type(), vec, xferOp.source(), xferIndices, + AffineMapAttr::get(unpackedPermutationMap(xferOp, builder)), Value(), + inBoundsAttr); + maybeApplyPassLabel(builder, newXfer.op); + + return newXfer; + } + + /// Handle out-of-bounds accesses on the to-be-unpacked dimension. + static void handleOutOfBoundsDim(OpBuilder &builder, TransferWriteOp xferOp, + Value buffer, Value iv) {} + + /// Cleanup after rewriting the op. + static void cleanup(PatternRewriter &rewriter, TransferWriteOp xferOp) { + rewriter.eraseOp(xferOp); + } +}; + +template +LogicalResult checkPrepareXferOp(OpTy xferOp) { + if (xferOp->hasAttr(kPassLabel)) + return failure(); + if (xferOp.getVectorType().getRank() <= kTargetRank) + return failure(); return success(); } -} // namespace +/// Prepare a TransferReadOp for progressive lowering. +/// +/// 1. Allocate a temporary buffer. +/// 2. Label the TransferReadOp, marking it eligible for progressive lowering. +/// 3. Store the result of the TransferReadOp into the temporary buffer. +/// 4. Load the result from the temporary buffer and replace all uses of the +/// original TransferReadOp with this load. +/// +/// E.g.: +/// ``` +/// %vec = vector.transfer_read %A[%a, %b, %c], %cst +/// : vector<5x4xf32>, memref +/// ``` +/// is rewritten to: +/// ``` +/// %0 = memref.alloca() : memref> +/// %1 = vector.transfer_read %A[%a, %b, %c], %cst +/// { __vector_to_scf_lowering__ } : vector<5x4xf32>, memref +/// memref.store %1, %0[] : memref> +/// %vec = memref.load %0[] : memref> +/// ``` +/// +/// Note: A second temporary buffer may be allocated for the `mask` operand. +struct PrepareTransferReadConversion : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; -/// Analyzes the `transfer` to find an access dimension along the fastest remote -/// MemRef dimension. If such a dimension with coalescing properties is found, -/// `pivs` and `vectorBoundsCapture` are swapped so that the invocation of -/// LoopNestBuilder captures it in the innermost loop. -template -static int computeCoalescedIndex(TransferOpTy transfer) { - // rank of the remote memory access, coalescing behavior occurs on the - // innermost memory dimension. - auto remoteRank = transfer.getShapedType().getRank(); - // Iterate over the results expressions of the permutation map to determine - // the loop order for creating pointwise copies between remote and local - // memories. 
- int coalescedIdx = -1; - auto exprs = transfer.permutation_map().getResults(); - for (auto en : llvm::enumerate(exprs)) { - auto dim = en.value().template dyn_cast(); - if (!dim) { - continue; + LogicalResult matchAndRewrite(TransferReadOp xferOp, + PatternRewriter &rewriter) const override { + if (checkPrepareXferOp(xferOp).failed()) + return failure(); + + ScopedContext scope(rewriter, xferOp.getLoc()); + auto buffers = allocBuffers(xferOp); + auto *newXfer = rewriter.clone(*xferOp.getOperation()); + newXfer->setAttr(kPassLabel, rewriter.getUnitAttr()); + if (xferOp.mask()) { + dyn_cast(newXfer).maskMutable().assign( + buffers.maskBuffer); } - auto memRefDim = dim.getPosition(); - if (memRefDim == remoteRank - 1) { - // memRefDim has coalescing properties, it should be swapped in the last - // position. - assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices"); - coalescedIdx = en.index(); + + memref_store(newXfer->getResult(0), buffers.dataBuffer); + rewriter.replaceOpWithNewOp(xferOp, buffers.dataBuffer); + + return success(); + } +}; + +/// Prepare a TransferWriteOp for progressive lowering. +/// +/// 1. Allocate a temporary buffer. +/// 2. Store the vector into the buffer. +/// 3. Load the vector from the buffer again. +/// 4. Use the loaded vector as a TransferWriteOp operand and label the op, +/// marking it eligible for progressive lowering via TransferOpConversion. +/// +/// E.g.: +/// ``` +/// vector.transfer_write %vec, %A[%a, %b, %c] +/// : vector<5x4xf32>, memref +/// ``` +/// is rewritten to: +/// ``` +/// %0 = memref.alloca() : memref> +/// memref.store %vec, %0[] : memref> +/// %1 = memref.load %0[] : memref> +/// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ } +/// : vector<5x4xf32>, memref +/// ``` +/// +/// Note: A second temporary buffer may be allocated for the `mask` operand. +struct PrepareTransferWriteConversion + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TransferWriteOp xferOp, + PatternRewriter &rewriter) const override { + if (checkPrepareXferOp(xferOp).failed()) + return failure(); + + ScopedContext scope(rewriter, xferOp.getLoc()); + auto buffers = allocBuffers(xferOp); + memref_store(xferOp.vector(), buffers.dataBuffer); + auto loadedVec = memref_load(buffers.dataBuffer); + rewriter.updateRootInPlace(xferOp, [&]() { + xferOp.vectorMutable().assign(loadedVec); + xferOp->setAttr(kPassLabel, rewriter.getUnitAttr()); + }); + + if (xferOp.mask()) { + rewriter.updateRootInPlace( + xferOp, [&]() { xferOp.maskMutable().assign(buffers.maskBuffer); }); } + + return success(); } - return coalescedIdx; -} +}; -template -VectorTransferRewriter::VectorTransferRewriter( - VectorTransferToSCFOptions options, MLIRContext *context) - : RewritePattern(TransferOpTy::getOperationName(), 1, context), - options(options) {} - -/// Used for staging the transfer in a local buffer. -template -MemRefType VectorTransferRewriter::tmpMemRefType( - TransferOpTy transfer) const { - auto vectorType = transfer.getVectorType(); - return MemRefType::get(vectorType.getShape().drop_back(), - VectorType::get(vectorType.getShape().take_back(), - vectorType.getElementType()), - {}, 0); -} +/// Progressive lowering of vector transfer ops: Unpack one dimension. +/// +/// 1. Unpack one dimension from the current buffer type and cast the buffer +/// to that new type. E.g.: +/// ``` +/// %vec = memref.load %0[%1] : memref<5xvector<4x3xf32>> +/// vector.transfer_write %vec ... 
+/// ``` +/// The following cast is generated: +/// ``` +/// %casted = vector.type_cast %0 +/// : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>> +/// ``` +/// 2. Generate a for loop and rewrite the transfer op according to the +/// corresponding Strategy. If the to-be-unpacked dimension can be +/// out-of-bounds, generate an if-check and handle both cases separately. +/// 3. Clean up according to the corresponding Strategy. +template +struct TransferOpConversion : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(OpTy xferOp, + PatternRewriter &rewriter) const override { + if (!xferOp->hasAttr(kPassLabel)) + return failure(); + + ScopedContext scope(rewriter, xferOp.getLoc()); + + // Find and cast data buffer. How the buffer can be found depends on OpTy. + auto dataBuffer = Strategy::getBuffer(xferOp); + auto dataBufferType = dataBuffer.getType().template dyn_cast(); + auto castedDataType = unpackOneDim(dataBufferType); + auto castedDataBuffer = vector_type_cast(castedDataType, dataBuffer); -static void emitWithBoundsChecks( - PatternRewriter &rewriter, VectorTransferOpInterface transfer, - ValueRange ivs, const MemRefBoundsCapture &memRefBoundsCapture, - function_ref)> inBoundsFun, - function_ref)> outOfBoundsFun = nullptr) { - // Permute the incoming indices according to the permutation map. - SmallVector indices = - applyMapToValues(rewriter, transfer.getLoc(), transfer.permutation_map(), - transfer.indices()); - - // Generate a bounds check if necessary. - SmallVector majorIvsPlusOffsets; - Value inBoundsCondition = - emitInBoundsCondition(rewriter, transfer, 0, ivs, indices, - memRefBoundsCapture, majorIvsPlusOffsets); - - // Apply the permutation map to the ivs. The permutation map may not use all - // the inputs. - SmallVector scalarAccessExprs(transfer.indices().size()); - for (unsigned memRefDim = 0; memRefDim < transfer.indices().size(); - ++memRefDim) { - // Linear search on a small number of entries. - int loopIndex = -1; - auto exprs = transfer.permutation_map().getResults(); - for (auto en : llvm::enumerate(exprs)) { - auto expr = en.value(); - auto dim = expr.dyn_cast(); - // Sanity check. - assert((dim || expr.cast().getValue() == 0) && - "Expected dim or 0 in permutationMap"); - if (dim && memRefDim == dim.getPosition()) { - loopIndex = en.index(); - break; + // If the xferOp has a mask: Find and cast mask buffer. + Value castedMaskBuffer; + if (xferOp.mask()) { + auto maskBuffer = getMaskBuffer(xferOp); + auto maskBufferType = + maskBuffer.getType().template dyn_cast(); + if (xferOp.isBroadcastDim(0) || xferOp.getMaskType().getRank() == 1) { + // Do not unpack a dimension of the mask, if: + // * To-be-unpacked transfer op dimension is a broadcast. + // * Mask is 1D, i.e., the mask cannot be further unpacked. + // (That means that all remaining dimensions of the transfer op must + // be broadcasted.) + castedMaskBuffer = maskBuffer; + } else { + auto castedMaskType = unpackOneDim(maskBufferType); + castedMaskBuffer = vector_type_cast(castedMaskType, maskBuffer); } } - using namespace edsc::op; - auto i = transfer.indices()[memRefDim]; - scalarAccessExprs[memRefDim] = loopIndex < 0 ? i : i + ivs[loopIndex]; - } - - if (inBoundsCondition) - conditionBuilder( - /* scf.if */ inBoundsCondition, // { - [&] { inBoundsFun(scalarAccessExprs); }, - // } else { - outOfBoundsFun ? [&] { outOfBoundsFun(scalarAccessExprs); } - : function_ref() - // } - ); - else - inBoundsFun(scalarAccessExprs); + // Loop bounds and step. 
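+    // E.g., for a casted buffer of type memref<5x4xvector<3xf32>> (shapes are
+    // illustrative), the loop generated below resembles:
+    //   scf.for %iv = %c0 to %c4 step %c1 { ... }
+    // where 4 is the size of the newly unpacked dimension.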
+ auto lb = std_constant_index(0).value; + auto ub = std_constant_index( + castedDataType.getDimSize(castedDataType.getRank() - 1)) + .value; + auto step = std_constant_index(1).value; + + // Generate for loop. + rewriter.create( + xferOp.getLoc(), lb, ub, step, ValueRange(), + [&](OpBuilder &b, Location loc, Value iv, ValueRange /*loopState*/) { + ScopedContext scope(b, loc); + generateInBoundsCheck( + xferOp, iv, b, unpackedDim(xferOp), + /*inBoundsCase=*/ + [&](OpBuilder &b, Location /*loc*/) { + // Create new transfer op. + OpTy newXfer = + Strategy::rewriteOp(b, xferOp, castedDataBuffer, iv); + + // If old transfer op has a mask: Set mask on new transfer op. + // Special case: If the mask of the old transfer op is 1D and + // the + // unpacked dim is not a broadcast, no mask is + // needed on the new transfer op. + if (xferOp.mask() && (xferOp.isBroadcastDim(0) || + xferOp.getMaskType().getRank() > 1)) { + OpBuilder::InsertionGuard guard(b); + b.setInsertionPoint(newXfer); // Insert load before newXfer. + + SmallVector loadIndices; + Strategy::getBufferIndices(xferOp, loadIndices); + // In case of broadcast: Use same indices to load from memref + // as before. + if (!xferOp.isBroadcastDim(0)) + loadIndices.push_back(iv); + + auto mask = memref_load(castedMaskBuffer, loadIndices); + rewriter.updateRootInPlace( + newXfer, [&]() { newXfer.maskMutable().assign(mask); }); + } + }, + /*outOfBoundsCase=*/ + [&](OpBuilder &b, Location /*loc*/) { + Strategy::handleOutOfBoundsDim(b, xferOp, + castedDataBuffer, iv); + }); + b.create(loc); + }); + + Strategy::cleanup(rewriter, xferOp); + return success(); + } +}; + +/// If the original transfer op has a mask, compute the mask of the new transfer +/// op (for the current iteration `i`) and assign it. +template +static void maybeAssignMask(OpBuilder &builder, OpTy xferOp, OpTy newXferOp, + int64_t i) { + if (!xferOp.mask()) + return; + + if (xferOp.isBroadcastDim(0)) { + // To-be-unpacked dimension is a broadcast, which does not have a + // corresponding mask dimension. Mask attribute remains unchanged. + newXferOp.maskMutable().assign(xferOp.mask()); + return; + } + + if (xferOp.getMaskType().getRank() > 1) { + // Unpack one dimension of the mask. + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPoint(newXferOp); // Insert load before newXfer. + + llvm::SmallVector indices({i}); + auto newMask = vector_extract(xferOp.mask(), indices).value; + newXferOp.maskMutable().assign(newMask); + } + + // If we end up here: The mask of the old transfer op is 1D and the unpacked + // dim is not a broadcast, so no mask is needed on the new transfer op. + // `generateInBoundsCheck` will have evaluated the mask already. } -namespace mlir { +/// Progressive lowering of vector TransferReadOp with unrolling: Unpack one +/// dimension. This is similar to TransferOpConversion, but no +/// memref buffer is allocated and the SCF loop is fully unrolled. +/// +/// ``` +/// E.g.: +/// ``` +/// %vec = vector.transfer_read %A[%a, %b, %c], %padding +/// : memref, vector<5x4xf32> +/// ``` +/// is rewritten to IR such as (simplified): +/// ``` +/// %v_init = splat %padding : vector<5x4xf32> +/// %tmp0 = vector.transfer_read %A[%a, %b, %c], %padding +/// : memref, vector<4xf32> +/// %v0 = vector.insert %tmp0, %v_init[0] : vector<4xf32> into vector<5x4xf32> +/// %tmp1 = vector.transfer_read %A[%a, %b + 1, %c], %padding +/// : memref, vector<4xf32> +/// %v1 = vector.insert %tmp1, %v0[1] : vector<4xf32> into vector<5x4xf32> +/// ... 
+/// %tmp4 = vector.transfer_read %A[%a, %b + 4, %c], %padding +/// : memref, vector<4xf32> +/// %vec = vector.insert %tmp1, %v3[4] : vector<4xf32> into vector<5x4xf32> +/// ``` +/// +/// Note: As an optimization, if the result of the original TransferReadOp +/// was directly inserted into another vector, no new %v_init vector is created. +/// Instead, the new TransferReadOp results are inserted into that vector. +struct UnrollTransferReadConversion : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + /// Return the vector into which the newly created TransferReadOp results + /// are inserted. + Value getResultVector(TransferReadOp xferOp, + PatternRewriter &rewriter) const { + if (auto insertOp = getInsertOp(xferOp)) + return insertOp.dest(); + return std_splat(xferOp.getVectorType(), xferOp.padding()).value; + } + + /// If the result of the TransferReadOp has exactly one user, which is a + /// vector::InsertOp, return that operation. + vector::InsertOp getInsertOp(TransferReadOp xferOp) const { + if (xferOp->hasOneUse()) { + Operation *xferOpUser = *xferOp->getUsers().begin(); + if (auto insertOp = dyn_cast(xferOpUser)) + return insertOp; + } + + return vector::InsertOp(); + } + + /// If the result of the TransferReadOp has exactly one user, which is a + /// vector::InsertOp, return that operation's indices. + void getInsertionIndices(TransferReadOp xferOp, + SmallVector &indices) const { + if (auto insertOp = getInsertOp(xferOp)) { + llvm::for_each(insertOp.position(), [&](Attribute attr) { + indices.push_back(attr.dyn_cast().getInt()); + }); + } + } + + /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds + /// accesses, and broadcasts and transposes in permutation maps. + LogicalResult matchAndRewrite(TransferReadOp xferOp, + PatternRewriter &rewriter) const override { + if (xferOp.getVectorType().getRank() <= kTargetRank) + return failure(); + + ScopedContext scope(rewriter, xferOp.getLoc()); + auto insertOp = getInsertOp(xferOp); + auto vec = getResultVector(xferOp, rewriter); + auto vecType = vec.getType().dyn_cast(); + auto xferVecType = xferOp.getVectorType(); + auto newXferVecType = VectorType::get(xferVecType.getShape().drop_front(), + xferVecType.getElementType()); + int64_t dimSize = xferVecType.getShape()[0]; + + // Generate fully unrolled loop of transfer ops. + for (int64_t i = 0; i < dimSize; ++i) { + Value iv = std_constant_index(i); + + vec = generateInBoundsCheck( + xferOp, iv, rewriter, unpackedDim(xferOp), TypeRange(vecType), + /*inBoundsCase=*/ + [&](OpBuilder &b, Location loc) { + ScopedContext scope(b, loc); + + // Indices for the new transfer op. + SmallVector xferIndices; + getXferIndices(xferOp, iv, xferIndices); + + // Indices for the new vector.insert op. + SmallVector insertionIndices; + getInsertionIndices(xferOp, insertionIndices); + insertionIndices.push_back(i); -/// Lowers TransferReadOp into a combination of: -/// 1. local memory allocation; -/// 2. perfect loop nest over: -/// a. scalar load from local buffers (viewed as a scalar memref); -/// a. scalar store to original memref (with padding). -/// 3. vector_load from local buffer (viewed as a memref<1 x vector>); -/// 4. local memory deallocation. 
+ auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr()); + auto newXferOpVal = + vector_transfer_read( + newXferVecType, xferOp.source(), xferIndices, + AffineMapAttr::get(unpackedPermutationMap(xferOp, b)), + xferOp.padding(), Value(), inBoundsAttr) + .value; + auto newXferOp = + dyn_cast(newXferOpVal.getDefiningOp()); + + maybeAssignMask(b, xferOp, newXferOp, i); + + return vector_insert(newXferOp, vec, insertionIndices).value; + }, + /*outOfBoundsCase=*/ + [&](OpBuilder &b, Location loc) { + // Loop through original (unmodified) vector. + return vec; + }); + } + + if (insertOp) { + // Rewrite single user of the old TransferReadOp, which was an InsertOp. + rewriter.replaceOp(insertOp, vec); + rewriter.eraseOp(xferOp); + } else { + rewriter.replaceOp(xferOp, vec); + } + + return success(); + } +}; + +/// Progressive lowering of vector TransferWriteOp with unrolling: Unpack one +/// dimension. This is similar to TransferOpConversion, but no +/// memref buffer is allocated and the SCF loop is fully unrolled. +/// +/// ``` +/// E.g.: +/// ``` +/// vector.transfer_write %vec, %A[%a, %b, %c] +/// : vector<5x4xf32>, memref +/// ``` +/// is rewritten to IR such as (simplified): +/// ``` +/// %v0 = vector.extract %vec[0] : vector<5x4xf32> +/// vector.transfer_write %v0, %A[%a, %b, %c] : vector<4xf32>, memref<...> +/// %v1 = vector.extract %vec[1] : vector<5x4xf32> +/// vector.transfer_write %v1, %A[%a, %b + 1, %c] : vector<4xf32>, memref<...> +/// ... +/// %v4 = vector.extract %vec[4] : vector<5x4xf32> +/// vector.transfer_write %v4, %A[%a, %b + 4, %c] : vector<4xf32>, memref<...> +/// ``` /// -/// Lowers the data transfer part of a TransferReadOp while ensuring no -/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by -/// padding. +/// Note: As an optimization, if the vector of the original TransferWriteOp +/// was directly extracted from another vector via an ExtractOp `a`, extract +/// the vectors for the newly generated TransferWriteOps from `a`'s input. By +/// doing so, `a` may become dead, and the number of ExtractOps generated during +/// recursive application of this pattern will be minimal. +struct UnrollTransferWriteConversion + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; -/// Performs the rewrite. -template <> -LogicalResult VectorTransferRewriter::matchAndRewrite( - Operation *op, PatternRewriter &rewriter) const { - using namespace mlir::edsc::op; + /// Return the vector from which newly generated ExtracOps will extract. + Value getDataVector(TransferWriteOp xferOp) const { + if (auto extractOp = getExtractOp(xferOp)) + return extractOp.vector(); + return xferOp.vector(); + } - TransferReadOp transfer = cast(op); - if (transfer.mask()) - return failure(); - auto memRefType = transfer.getShapedType().dyn_cast(); - if (!memRefType) - return failure(); - // Fall back to a loop if the fastest varying stride is not 1 or it is - // permuted. - int64_t offset; - SmallVector strides; - auto successStrides = getStridesAndOffset(memRefType, strides, offset); - if (succeeded(successStrides) && strides.back() == 1 && - transfer.permutation_map().isMinorIdentity()) { - // If > 1D, emit a bunch of loops around 1-D vector transfers. - if (transfer.getVectorType().getRank() > 1) - return NDTransferOpHelper(rewriter, transfer, options) - .doReplace(); - // If 1-D this is now handled by the target-specific lowering. - if (transfer.getVectorType().getRank() == 1) + /// If the input of the given TransferWriteOp is an ExtractOp, return it. 
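+  /// E.g., in IR such as the following (illustrative shapes), the returned op
+  /// is the vector.extract that defines %vec:
+  /// ```
+  /// %vec = vector.extract %big[0] : vector<7x5x4xf32>
+  /// vector.transfer_write %vec, %A[%a, %b, %c]
+  ///     : vector<5x4xf32>, memref<?x?x?xf32>
+  /// ```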
+ vector::ExtractOp getExtractOp(TransferWriteOp xferOp) const { + if (auto *op = xferOp.vector().getDefiningOp()) + return dyn_cast(op); + return vector::ExtractOp(); + } + + /// If the input of the given TransferWriteOp is an ExtractOp, return its + /// indices. + void getExtractionIndices(TransferWriteOp xferOp, + SmallVector &indices) const { + if (auto extractOp = getExtractOp(xferOp)) { + llvm::for_each(extractOp.position(), [&](Attribute attr) { + indices.push_back(attr.dyn_cast().getInt()); + }); + } + } + + /// Rewrite the op: Unpack one dimension. Can handle masks, out-of-bounds + /// accesses, and broadcasts and transposes in permutation maps. + LogicalResult matchAndRewrite(TransferWriteOp xferOp, + PatternRewriter &rewriter) const override { + if (xferOp.getVectorType().getRank() <= kTargetRank) return failure(); + + ScopedContext scope(rewriter, xferOp.getLoc()); + auto vec = getDataVector(xferOp); + auto xferVecType = xferOp.getVectorType(); + int64_t dimSize = xferVecType.getShape()[0]; + + // Generate fully unrolled loop of transfer ops. + for (int64_t i = 0; i < dimSize; ++i) { + Value iv = std_constant_index(i); + + generateInBoundsCheck( + xferOp, iv, rewriter, unpackedDim(xferOp), + /*inBoundsCase=*/[&](OpBuilder &b, Location loc) { + ScopedContext scope(b, loc); + + // Indices for the new transfer op. + SmallVector xferIndices; + getXferIndices(xferOp, iv, xferIndices); + + // Indices for the new vector.extract op. + SmallVector extractionIndices; + getExtractionIndices(xferOp, extractionIndices); + extractionIndices.push_back(i); + + auto extracted = vector_extract(vec, extractionIndices).value; + auto inBoundsAttr = dropFirstElem(b, xferOp.in_boundsAttr()); + + auto newXferOp = + vector_transfer_write( + Type(), extracted, xferOp.source(), xferIndices, + AffineMapAttr::get(unpackedPermutationMap(xferOp, b)), + Value(), inBoundsAttr) + .op; + + maybeAssignMask(b, xferOp, newXferOp, i); + }); + } + + rewriter.eraseOp(xferOp); + return success(); } +}; - // Conservative lowering to scalar load / stores. - // 1. Setup all the captures. - ScopedContext scope(rewriter, transfer.getLoc()); - MemRefIndexedValue remote(transfer.source()); - MemRefBoundsCapture memRefBoundsCapture(transfer.source()); - VectorBoundsCapture vectorBoundsCapture(transfer.vector()); - int coalescedIdx = computeCoalescedIndex(transfer); - // Swap the vectorBoundsCapture which will reorder loop bounds. - if (coalescedIdx >= 0) - vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1, - coalescedIdx); - - auto lbs = vectorBoundsCapture.getLbs(); - auto ubs = vectorBoundsCapture.getUbs(); - SmallVector steps; - steps.reserve(vectorBoundsCapture.getSteps().size()); - for (auto step : vectorBoundsCapture.getSteps()) - steps.push_back(std_constant_index(step)); - - // 2. Emit alloc-copy-load-dealloc. - MLIRContext *ctx = op->getContext(); - Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer); - MemRefIndexedValue local(tmp); - loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) { - auto ivsStorage = llvm::to_vector<8>(loopIvs); - // Swap the ivs which will reorder memory accesses. 
- if (coalescedIdx >= 0) - std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]); - - ArrayRef ivs(ivsStorage); - Value pos = std_index_cast(IntegerType::get(ctx, 32), ivs.back()); - Value inVector = local(ivs.drop_back()); - auto loadValue = [&](ArrayRef indices) { - Value vector = vector_insert_element(remote(indices), inVector, pos); - local(ivs.drop_back()) = vector; - }; - auto loadPadding = [&](ArrayRef) { - Value vector = vector_insert_element(transfer.padding(), inVector, pos); - local(ivs.drop_back()) = vector; - }; - emitWithBoundsChecks( - rewriter, cast(transfer.getOperation()), ivs, - memRefBoundsCapture, loadValue, loadPadding); - }); - Value vectorValue = memref_load(vector_type_cast(tmp)); - - // 3. Propagate. - rewriter.replaceOp(op, vectorValue); - return success(); +/// Compute the indices into the memref for the LoadOp/StoreOp generated as +/// part of TransferOp1dConversion. Return the memref dimension on which +/// the transfer is operating. A return value of None indicates a broadcast. +template +static Optional +get1dMemrefIndices(OpTy xferOp, Value iv, + SmallVector &memrefIndices) { + auto indices = xferOp.indices(); + auto map = xferOp.permutation_map(); + + memrefIndices.append(indices.begin(), indices.end()); + assert(map.getNumResults() == 1 && + "Expected 1 permutation map result for 1D transfer"); + if (auto expr = map.getResult(0).template dyn_cast()) { + auto dim = expr.getPosition(); + using edsc::op::operator+; + memrefIndices[dim] = memrefIndices[dim] + iv; + return dim; + } + + assert(xferOp.isBroadcastDim(0) && + "Expected AffineDimExpr or AffineConstantExpr"); + return None; } -/// Lowers TransferWriteOp into a combination of: -/// 1. local memory allocation; -/// 2. vector_store to local buffer (viewed as a memref<1 x vector>); -/// 3. perfect loop nest over: -/// a. scalar load from local buffers (viewed as a scalar memref); -/// a. scalar store to original memref (if in bounds). -/// 4. local memory deallocation. -/// -/// More specifically, lowers the data transfer part while ensuring no -/// out-of-bounds accesses are possible. +/// Codegen strategy for TransferOp1dConversion, depending on the +/// operation. +template +struct Strategy1d; + +/// Codegen strategy for TransferReadOp. template <> -LogicalResult VectorTransferRewriter::matchAndRewrite( - Operation *op, PatternRewriter &rewriter) const { - using namespace edsc::op; +struct Strategy1d { + static void generateForLoopBody(OpBuilder &builder, Location loc, + TransferReadOp xferOp, Value iv, + ValueRange loopState) { + SmallVector indices; + auto dim = get1dMemrefIndices(xferOp, iv, indices); + auto ivI32 = std_index_cast(IntegerType::get(builder.getContext(), 32), iv); + auto vec = loopState[0]; - TransferWriteOp transfer = cast(op); - if (transfer.mask()) - return failure(); - auto memRefType = transfer.getShapedType().template dyn_cast(); - if (!memRefType) - return failure(); + // In case of out-of-bounds access, leave `vec` as is (was initialized with + // padding value). + auto nextVec = generateInBoundsCheck( + xferOp, iv, builder, dim, TypeRange(xferOp.getVectorType()), + /*inBoundsCase=*/ + [&](OpBuilder & /*b*/, Location loc) { + auto val = memref_load(xferOp.source(), indices); + return vector_insert_element(val, vec, ivI32.value).value; + }, + /*outOfBoundsCase=*/ + [&](OpBuilder & /*b*/, Location loc) { return vec; }); + builder.create(loc, nextVec); + } + + static Value initialLoopState(TransferReadOp xferOp) { + // Inititalize vector with padding value. 
+ return std_splat(xferOp.getVectorType(), xferOp.padding()).value; + } +}; + +/// Codegen strategy for TransferWriteOp. +template <> +struct Strategy1d { + static void generateForLoopBody(OpBuilder &builder, Location loc, + TransferWriteOp xferOp, Value iv, + ValueRange /*loopState*/) { + SmallVector indices; + auto dim = get1dMemrefIndices(xferOp, iv, indices); + auto ivI32 = std_index_cast(IntegerType::get(builder.getContext(), 32), iv); + + // Nothing to do in case of out-of-bounds access. + generateInBoundsCheck( + xferOp, iv, builder, dim, + /*inBoundsCase=*/[&](OpBuilder & /*b*/, Location loc) { + auto val = vector_extract_element(xferOp.vector(), ivI32.value); + memref_store(val, xferOp.source(), indices); + }); + builder.create(loc); + } - // Fall back to a loop if the fastest varying stride is not 1 or it is - // permuted. + static Value initialLoopState(TransferWriteOp xferOp) { return Value(); } +}; + +/// Return true if the last dimension of the MemRefType has unit stride. +static bool isLastMemrefDimUnitStride(MemRefType type) { int64_t offset; SmallVector strides; - auto successStrides = getStridesAndOffset(memRefType, strides, offset); - if (succeeded(successStrides) && strides.back() == 1 && - transfer.permutation_map().isMinorIdentity()) { - // If > 1D, emit a bunch of loops around 1-D vector transfers. - if (transfer.getVectorType().getRank() > 1) - return NDTransferOpHelper(rewriter, transfer, options) - .doReplace(); - // If 1-D this is now handled by the target-specific lowering. - if (transfer.getVectorType().getRank() == 1) + auto successStrides = getStridesAndOffset(type, strides, offset); + return succeeded(successStrides) && strides.back() == 1; +} + +/// Lower a 1D vector transfer op to SCF using scalar loads/stores. This is +/// necessary in cases where a 1D vector transfer op cannot be lowered into +/// vector load/stores due to non-unit strides or broadcasts: +/// +/// * Transfer dimension is not the last memref dimension +/// * Transfer dimension is a broadcast (i.e., scalar load + broadcast) +/// * Memref has a layout map with non-unit stride on the last dimension +/// +/// This pattern generates IR as follows: +/// +/// 1. Generate a for loop iterating over each vector element. +/// 2. Inside the loop, generate a InsertElementOp or ExtractElementOp, +/// depending on OpTy. +/// +/// TODO: In some cases (no masking, etc.), LLVM::MatrixColumnMajorLoadOp +/// can be generated instead of TransferOp1dConversion. Add such a pattern +/// to ConvertVectorToLLVM. +/// +/// E.g.: +/// ``` +/// vector.transfer_write %vec, %A[%a, %b] +/// {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]} +/// : vector<9xf32>, memref +/// ``` +/// Is rewritten to approximately the following pseudo-IR: +/// ``` +/// for i = 0 to 9 { +/// %t = vector.extractelement %vec[i] : vector<9xf32> +/// memref.store %t, %arg0[%a + i, %b] : memref +/// } +/// ``` +template +struct TransferOp1dConversion : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(OpTy xferOp, + PatternRewriter &rewriter) const override { + ScopedContext scope(rewriter, xferOp.getLoc()); + auto map = xferOp.permutation_map(); + auto memRefType = xferOp.getShapedType().template dyn_cast(); + + if (!memRefType) return failure(); + if (xferOp.getVectorType().getRank() != 1) + return failure(); + if (map.isMinorIdentity() && isLastMemrefDimUnitStride(memRefType)) + return failure(); // Handled by ConvertVectorToLLVM + + // Loop bounds, step, state... 
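+    // For a 1-D transfer_read, the generated IR resembles the following
+    // sketch (names, shapes, and index arithmetic are illustrative pseudo-IR):
+    //   %init = splat %padding : vector<9xf32>
+    //   %vec = scf.for %i = %c0 to %c9 step %c1
+    //       iter_args(%v = %init) -> (vector<9xf32>) {
+    //     %el = memref.load %A[%a + %i, %b] : memref<?x?xf32>
+    //     %v2 = vector.insertelement %el, %v[%i_i32 : i32] : vector<9xf32>
+    //     scf.yield %v2 : vector<9xf32>
+    //   }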
+ auto vecType = xferOp.getVectorType(); + auto lb = std_constant_index(0); + auto ub = std_constant_index(vecType.getDimSize(0)); + auto step = std_constant_index(1); + auto loopState = Strategy1d::initialLoopState(xferOp); + + // Generate for loop. + rewriter.replaceOpWithNewOp( + xferOp, lb, ub, step, loopState ? ValueRange(loopState) : ValueRange(), + [&](OpBuilder &builder, Location loc, Value iv, ValueRange loopState) { + ScopedContext nestedScope(builder, loc); + Strategy1d::generateForLoopBody(builder, loc, xferOp, iv, + loopState); + }); + + return success(); } +}; - // 1. Setup all the captures. - ScopedContext scope(rewriter, transfer.getLoc()); - MemRefIndexedValue remote(transfer.source()); - MemRefBoundsCapture memRefBoundsCapture(transfer.source()); - Value vectorValue(transfer.vector()); - VectorBoundsCapture vectorBoundsCapture(transfer.vector()); - int coalescedIdx = computeCoalescedIndex(transfer); - // Swap the vectorBoundsCapture which will reorder loop bounds. - if (coalescedIdx >= 0) - vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1, - coalescedIdx); - - auto lbs = vectorBoundsCapture.getLbs(); - auto ubs = vectorBoundsCapture.getUbs(); - SmallVector steps; - steps.reserve(vectorBoundsCapture.getSteps().size()); - for (auto step : vectorBoundsCapture.getSteps()) - steps.push_back(std_constant_index(step)); - - // 2. Emit alloc-store-copy-dealloc. - Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer); - MemRefIndexedValue local(tmp); - Value vec = vector_type_cast(tmp); - memref_store(vectorValue, vec); - loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) { - auto ivsStorage = llvm::to_vector<8>(loopIvs); - // Swap the ivsStorage which will reorder memory accesses. - if (coalescedIdx >= 0) - std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]); - - ArrayRef ivs(ivsStorage); - Value pos = - std_index_cast(IntegerType::get(op->getContext(), 32), ivs.back()); - auto storeValue = [&](ArrayRef indices) { - Value scalar = vector_extract_element(local(ivs.drop_back()), pos); - remote(indices) = scalar; - }; - emitWithBoundsChecks( - rewriter, cast(transfer.getOperation()), ivs, - memRefBoundsCapture, storeValue); - }); - - // 3. Erase. 
- rewriter.eraseOp(op); - return success(); -} +} // namespace + +namespace mlir { void populateVectorToSCFConversionPatterns( RewritePatternSet &patterns, const VectorTransferToSCFOptions &options) { - patterns.add, - VectorTransferRewriter>( - options, patterns.getContext()); + if (options.unroll) { + patterns.add( + patterns.getContext()); + } else { + patterns.add, + TransferOpConversion>(patterns.getContext()); + } + + if (kTargetRank == 1) { + patterns.add, + TransferOp1dConversion>( + patterns.getContext()); + } } } // namespace mlir diff --git a/mlir/test/Conversion/VectorToSCF/progressive-vector-to-loops.mlir b/mlir/test/Conversion/VectorToSCF/progressive-vector-to-loops.mlir deleted file mode 100644 --- a/mlir/test/Conversion/VectorToSCF/progressive-vector-to-loops.mlir +++ /dev/null @@ -1,467 +0,0 @@ -// RUN: mlir-opt %s -test-progressive-convert-vector-to-scf -split-input-file -allow-unregistered-dialect | FileCheck %s -// RUN: mlir-opt %s -test-unrolled-progressive-convert-vector-to-scf -split-input-file -allow-unregistered-dialect | FileCheck %s --check-prefix=FULL-UNROLL - -// CHECK-LABEL: func @materialize_read_1d() { -func @materialize_read_1d() { - %f0 = constant 0.0: f32 - %A = memref.alloc () : memref<7x42xf32> - affine.for %i0 = 0 to 7 step 4 { - affine.for %i1 = 0 to 42 step 4 { - %f1 = vector.transfer_read %A[%i0, %i1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> - %ip1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i1) - %f2 = vector.transfer_read %A[%i0, %ip1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> - %ip2 = affine.apply affine_map<(d0) -> (d0 + 2)> (%i1) - %f3 = vector.transfer_read %A[%i0, %ip2], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> - %ip3 = affine.apply affine_map<(d0) -> (d0 + 3)> (%i1) - %f4 = vector.transfer_read %A[%i0, %ip3], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32> - // Both accesses in the load must be clipped otherwise %i1 + 2 and %i1 + 3 will go out of bounds. - // CHECK: scf.if - // CHECK-NEXT: memref.load - // CHECK-NEXT: vector.insertelement - // CHECK-NEXT: scf.yield - // CHECK-NEXT: else - // CHECK-NEXT: scf.yield - // Add a dummy use to prevent dead code elimination from removing transfer - // read ops. - "dummy_use"(%f1, %f2, %f3, %f4) : (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> () - } - } - return -} - -// ----- - -// CHECK-LABEL: func @materialize_read_1d_partially_specialized -func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %dyn4 : index) { - %f0 = constant 0.0: f32 - %A = memref.alloc (%dyn1, %dyn2, %dyn4) : memref<7x?x?x42x?xf32> - affine.for %i0 = 0 to 7 { - affine.for %i1 = 0 to %dyn1 { - affine.for %i2 = 0 to %dyn2 { - affine.for %i3 = 0 to 42 step 2 { - affine.for %i4 = 0 to %dyn4 { - %f1 = vector.transfer_read %A[%i0, %i1, %i2, %i3, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32> - %i3p1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i3) - %f2 = vector.transfer_read %A[%i0, %i1, %i2, %i3p1, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32> - // Add a dummy use to prevent dead code elimination from removing - // transfer read ops. 
- "dummy_use"(%f1, %f2) : (vector<4xf32>, vector<4xf32>) -> () - } - } - } - } - } - // CHECK: %[[tensor:[0-9]+]] = memref.alloc - // CHECK-NOT: {{.*}} memref.dim %[[tensor]], %c0 - // CHECK-NOT: {{.*}} memref.dim %[[tensor]], %c3 - return -} - -// ----- - -// CHECK: #[[$ADD:map.*]] = affine_map<(d0, d1) -> (d0 + d1)> - -// CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) { -func @materialize_read(%M: index, %N: index, %O: index, %P: index) { - %f0 = constant 0.0: f32 - // CHECK-DAG: %[[ALLOC:.*]] = memref.alloca() : memref> - // CHECK-DAG: %[[C0:.*]] = constant 0 : index - // CHECK-DAG: %[[C1:.*]] = constant 1 : index - // CHECK-DAG: %[[C3:.*]] = constant 3 : index - // CHECK-DAG: %[[C4:.*]] = constant 4 : index - // CHECK-DAG: %[[C5:.*]] = constant 5 : index - // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref - // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 { - // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} { - // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} { - // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 { - // CHECK: scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] { - // CHECK: scf.if - // CHECK: %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]]) - // CHECK: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { - // CHECK: %[[VEC:.*]] = scf.for %[[I6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<3xf32>) { - // CHECK: %[[L0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I6]]) - // CHECK: %[[VIDX:.*]] = index_cast %[[I6]] - // CHECK: scf.if {{.*}} -> (vector<3xf32>) { - // CHECK-NEXT: %[[SCAL:.*]] = memref.load %{{.*}}[%[[L0]], %[[I1]], %[[I2]], %[[L3]]] : memref - // CHECK-NEXT: %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %{{.*}}[%[[VIDX]] : i32] : vector<3xf32> - // CHECK-NEXT: scf.yield - // CHECK-NEXT: } else { - // CHECK-NEXT: scf.yield - // CHECK-NEXT: } - // CHECK-NEXT: scf.yield - // CHECK-NEXT: } - // CHECK-NEXT: memref.store %[[VEC]], {{.*}} : memref<5x4xvector<3xf32>> - // CHECK-NEXT: } - // CHECK-NEXT: } else { - // CHECK-NEXT: memref.store {{.*}} : memref<5xvector<4x3xf32>> - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: %[[LD:.*]] = memref.load %[[ALLOC]][] : memref> - // CHECK-NEXT: "dummy_use"(%[[LD]]) : (vector<5x4x3xf32>) -> () - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: return - // CHECK-NEXT:} - - // Check that I0 + I4 (of size 3) read from first index load(L0, ...) and write into last index store(..., I4) - // Check that I3 + I6 (of size 5) read from last index load(..., L3) and write into first index store(I6, ...) - // Other dimensions are just accessed with I1, I2 resp. - %A = memref.alloc (%M, %N, %O, %P) : memref - affine.for %i0 = 0 to %M step 3 { - affine.for %i1 = 0 to %N { - affine.for %i2 = 0 to %O { - affine.for %i3 = 0 to %P step 5 { - %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, 0, d0)>} : memref, vector<5x4x3xf32> - // Add a dummy use to prevent dead code elimination from removing - // transfer read ops. 
- "dummy_use"(%f) : (vector<5x4x3xf32>) -> () - } - } - } - } - return -} - -// ----- - -// CHECK: #[[$ADD:map.*]] = affine_map<(d0, d1) -> (d0 + d1)> - -// CHECK-LABEL:func @materialize_write(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) { -func @materialize_write(%M: index, %N: index, %O: index, %P: index) { - // CHECK-DAG: %[[ALLOC:.*]] = memref.alloca() : memref> - // CHECK-DAG: %{{.*}} = constant dense<1.000000e+00> : vector<5x4x3xf32> - // CHECK-DAG: %[[C0:.*]] = constant 0 : index - // CHECK-DAG: %[[C1:.*]] = constant 1 : index - // CHECK-DAG: %[[C3:.*]] = constant 3 : index - // CHECK-DAG: %[[C4:.*]] = constant 4 : index - // CHECK-DAG: %[[C5:.*]] = constant 5 : index - // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref - // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 { - // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} step 4 { - // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} { - // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 { - // CHECK: memref.store %{{.*}}, %[[ALLOC]][] : memref> - // CHECK: %[[VECTOR_VIEW1:.*]] = vector.type_cast %[[ALLOC]] : memref> to memref<5xvector<4x3xf32>> - // CHECK: scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] { - // CHECK: scf.if - // CHECK: %[[S3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]]) - // CHECK: %[[VECTOR_VIEW2:.*]] = vector.type_cast %[[VECTOR_VIEW1]] : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>> - // CHECK: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { - // CHECK: scf.if - // CHECK: %[[S1:.*]] = affine.apply #[[$ADD]](%[[I1]], %[[I5]]) - // CHECK: %[[VEC:.*]] = memref.load %[[VECTOR_VIEW2]][%[[I4]], %[[I5]]] : memref<5x4xvector<3xf32>> - // CHECK: scf.for %[[I6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] { - // CHECK: %[[S0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I6]]) - // CHECK: %[[VIDX:.*]] = index_cast %[[I6]] - // CHECK: scf.if - // CHECK: %[[SCAL:.*]] = vector.extractelement %[[VEC]][%[[VIDX]] : i32] : vector<3xf32> - // CHECK: memref.store %[[SCAL]], {{.*}}[%[[S0]], %[[S1]], %[[I2]], %[[S3]]] : memref - // CHECK: } - // CHECK: } - // CHECK: } - // CHECK: } - // CHECK: } - // CHECK: } - // CHECK: } - // CHECK: } - // CHECK: } - // CHECK: } - // CHECK: return - - // Check that I0 + I4 (of size 3) read from last index load(..., I4) and write into first index store(S0, ...) - // Check that I1 + I5 (of size 4) read from second index load(..., I5, ...) and write into second index store(..., S1, ...) - // Check that I3 + I6 (of size 5) read from first index load(I6, ...) and write into last index store(..., S3) - // Other dimension is just accessed with I2. 
- %A = memref.alloc (%M, %N, %O, %P) : memref - %f1 = constant dense<1.000000e+00> : vector<5x4x3xf32> - affine.for %i0 = 0 to %M step 3 { - affine.for %i1 = 0 to %N step 4 { - affine.for %i2 = 0 to %O { - affine.for %i3 = 0 to %P step 5 { - vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d1, d0)>} : vector<5x4x3xf32>, memref - } - } - } - } - return -} - -// ----- - -// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)> - -// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)> -// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)> - - -// CHECK-LABEL: transfer_read_progressive( -// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref, -// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index - -// FULL-UNROLL-LABEL: transfer_read_progressive( -// FULL-UNROLL-SAME: %[[A:[a-zA-Z0-9]+]]: memref, -// FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index - -func @transfer_read_progressive(%A : memref, %base: index) -> vector<3x15xf32> { - %f7 = constant 7.0: f32 - // CHECK-DAG: %[[C7:.*]] = constant 7.000000e+00 : f32 - // CHECK-DAG: %[[C0:.*]] = constant 0 : index - // CHECK-DAG: %[[C1:.*]] = constant 1 : index - // CHECK-DAG: %[[C3:.*]] = constant 3 : index - // CHECK-DAG: %[[splat:.*]] = constant dense<7.000000e+00> : vector<15xf32> - // CHECK-DAG: %[[alloc:.*]] = memref.alloca() : memref> - // CHECK: %[[alloc_casted:.*]] = vector.type_cast %[[alloc]] : memref> to memref<3xvector<15xf32>> - // CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C3]] - // CHECK: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref - // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] - // CHECK: %[[cond1:.*]] = cmpi sgt, %[[dim]], %[[add]] : index - // CHECK: scf.if %[[cond1]] { - // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref, vector<15xf32> - // CHECK: memref.store %[[vec_1d]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>> - // CHECK: } else { - // CHECK: store %[[splat]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>> - // CHECK: } - // CHECK: } - // CHECK: %[[cst:.*]] = memref.load %[[alloc]][] : memref> - - // FULL-UNROLL: %[[C7:.*]] = constant 7.000000e+00 : f32 - // FULL-UNROLL: %[[VEC0:.*]] = constant dense<7.000000e+00> : vector<3x15xf32> - // FULL-UNROLL: %[[C0:.*]] = constant 0 : index - // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref - // FULL-UNROLL: cmpi sgt, %[[DIM]], %[[base]] : index - // FULL-UNROLL: %[[VEC1:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[C7]] : memref, vector<15xf32> - // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32> - // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> - // FULL-UNROLL: } else { - // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> - // FULL-UNROLL: } - // FULL-UNROLL: affine.apply #[[$MAP1]]()[%[[base]]] - // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index - // FULL-UNROLL: %[[VEC2:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref, vector<15xf32> - // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32> - // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> - // FULL-UNROLL: } else { - // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> - // FULL-UNROLL: } - // FULL-UNROLL: affine.apply #[[$MAP2]]()[%[[base]]] - // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index - // FULL-UNROLL: 
%[[VEC3:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref, vector<15xf32> - // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32> - // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> - // FULL-UNROLL: } else { - // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> - // FULL-UNROLL: } - - %f = vector.transfer_read %A[%base, %base], %f7 : - memref, vector<3x15xf32> - - return %f: vector<3x15xf32> -} - -// ----- - -// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)> - -// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)> -// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)> - -// CHECK-LABEL: transfer_write_progressive( -// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref, -// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index, -// CHECK-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32> -// FULL-UNROLL-LABEL: transfer_write_progressive( -// FULL-UNROLL-SAME: %[[A:[a-zA-Z0-9]+]]: memref, -// FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index, -// FULL-UNROLL-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32> -func @transfer_write_progressive(%A : memref, %base: index, %vec: vector<3x15xf32>) { - // CHECK-DAG: %[[C0:.*]] = constant 0 : index - // CHECK-DAG: %[[C1:.*]] = constant 1 : index - // CHECK-DAG: %[[C3:.*]] = constant 3 : index - // CHECK: %[[alloc:.*]] = memref.alloca() : memref> - // CHECK: memref.store %[[vec]], %[[alloc]][] : memref> - // CHECK: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref> to memref<3xvector<15xf32>> - // CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C3]] - // CHECK: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref - // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] - // CHECK: %[[cmp:.*]] = cmpi sgt, %[[dim]], %[[add]] : index - // CHECK: scf.if %[[cmp]] { - // CHECK: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>> - // CHECK: vector.transfer_write %[[vec_1d]], %[[A]][{{.*}}, %[[base]]] : vector<15xf32>, memref - // CHECK: } - // CHECK: } - - // FULL-UNROLL: %[[C0:.*]] = constant 0 : index - // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref - // FULL-UNROLL: %[[CMP0:.*]] = cmpi sgt, %[[DIM]], %[[base]] : index - // FULL-UNROLL: scf.if %[[CMP0]] { - // FULL-UNROLL: %[[V0:.*]] = vector.extract %[[vec]][0] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[V0]], %[[A]][%[[base]], %[[base]]] : vector<15xf32>, memref - // FULL-UNROLL: } - // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]] - // FULL-UNROLL: %[[CMP1:.*]] = cmpi sgt, %{{.*}}, %[[I1]] : index - // FULL-UNROLL: scf.if %[[CMP1]] { - // FULL-UNROLL: %[[V1:.*]] = vector.extract %[[vec]][1] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[V1]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref - // FULL-UNROLL: } - // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]] - // FULL-UNROLL: %[[CMP2:.*]] = cmpi sgt, %{{.*}}, %[[I2]] : index - // FULL-UNROLL: scf.if %[[CMP2]] { - // FULL-UNROLL: %[[V2:.*]] = vector.extract %[[vec]][2] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[V2]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref - // FULL-UNROLL: } - - vector.transfer_write %vec, %A[%base, %base] : - vector<3x15xf32>, memref - return -} - -// ----- - -// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)> - -// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)> -// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 
2)> - -// CHECK-LABEL: transfer_write_progressive_inbounds( -// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref, -// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index, -// CHECK-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32> -// FULL-UNROLL-LABEL: transfer_write_progressive_inbounds( -// FULL-UNROLL-SAME: %[[A:[a-zA-Z0-9]+]]: memref, -// FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index, -// FULL-UNROLL-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32> -func @transfer_write_progressive_inbounds(%A : memref, %base: index, %vec: vector<3x15xf32>) { - // CHECK-NOT: scf.if - // CHECK-DAG: %[[C0:.*]] = constant 0 : index - // CHECK-DAG: %[[C3:.*]] = constant 3 : index - // CHECK: %[[alloc:.*]] = memref.alloca() : memref> - // CHECK-NEXT: memref.store %[[vec]], %[[alloc]][] : memref> - // CHECK-NEXT: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref> to memref<3xvector<15xf32>> - // CHECK-NEXT: scf.for %[[I:.*]] = %[[C0]] to %[[C3]] - // CHECK-NEXT: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] - // CHECK-NEXT: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>> - // CHECK-NEXT: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref - - // FULL-UNROLL: %[[VEC0:.*]] = vector.extract %[[vec]][0] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[VEC0]], %[[A]][%[[base]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref - // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]] - // FULL-UNROLL: %[[VEC1:.*]] = vector.extract %[[vec]][1] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %2, %[[A]][%[[I1]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref - // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]] - // FULL-UNROLL: %[[VEC2:.*]] = vector.extract %[[vec]][2] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[VEC2:.*]], %[[A]][%[[I2]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref - vector.transfer_write %vec, %A[%base, %base] {in_bounds = [true, true]} : - vector<3x15xf32>, memref - return -} - -// ----- - -// FULL-UNROLL-LABEL: transfer_read_simple -func @transfer_read_simple(%A : memref<2x2xf32>) -> vector<2x2xf32> { - %c0 = constant 0 : index - %f0 = constant 0.0 : f32 - // FULL-UNROLL-DAG: %[[VC0:.*]] = constant dense<0.000000e+00> : vector<2x2xf32> - // FULL-UNROLL-DAG: %[[C0:.*]] = constant 0 : index - // FULL-UNROLL-DAG: %[[C1:.*]] = constant 1 : index - // FULL-UNROLL: %[[V0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]] - // FULL-UNROLL: %[[RES0:.*]] = vector.insert %[[V0]], %[[VC0]] [0] : vector<2xf32> into vector<2x2xf32> - // FULL-UNROLL: %[[V1:.*]] = vector.transfer_read %{{.*}}[%[[C1]], %[[C0]]] - // FULL-UNROLL: %[[RES1:.*]] = vector.insert %[[V1]], %[[RES0]] [1] : vector<2xf32> into vector<2x2xf32> - %0 = vector.transfer_read %A[%c0, %c0], %f0 : memref<2x2xf32>, vector<2x2xf32> - return %0 : vector<2x2xf32> -} - -func @transfer_read_minor_identity(%A : memref) -> vector<3x3xf32> { - %c0 = constant 0 : index - %f0 = constant 0.0 : f32 - %0 = vector.transfer_read %A[%c0, %c0, %c0, %c0], %f0 - { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> } - : memref, vector<3x3xf32> - return %0 : vector<3x3xf32> -} - -// CHECK-LABEL: transfer_read_minor_identity( -// CHECK-SAME: %[[A:.*]]: memref) -> vector<3x3xf32> -// CHECK-DAG: %[[c0:.*]] = constant 0 : index -// CHECK-DAG: %[[c1:.*]] = constant 1 : index -// CHECK-DAG: %[[c2:.*]] = constant 2 : index -// CHECK-DAG: %[[c3:.*]] = constant 3 : index -// CHECK-DAG: 
%[[f0:.*]] = constant 0.000000e+00 : f32 -// CHECK-DAG: %[[cst0:.*]] = constant dense<0.000000e+00> : vector<3xf32> -// CHECK: %[[m:.*]] = memref.alloca() : memref> -// CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref> to memref<3xvector<3xf32>> -// CHECK: scf.for %[[arg1:.*]] = %[[c0]] to %[[c3]] -// CHECK: %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref -// CHECK: %[[cmp:.*]] = cmpi sgt, %[[d]], %[[arg1]] : index -// CHECK: scf.if %[[cmp]] { -// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] : memref, vector<3xf32> -// CHECK: memref.store %[[tr]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>> -// CHECK: } else { -// CHECK: memref.store %[[cst0]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>> -// CHECK: } -// CHECK: } -// CHECK: %[[ret:.*]] = memref.load %[[m]][] : memref> -// CHECK: return %[[ret]] : vector<3x3xf32> - -func @transfer_write_minor_identity(%A : vector<3x3xf32>, %B : memref) { - %c0 = constant 0 : index - %f0 = constant 0.0 : f32 - vector.transfer_write %A, %B[%c0, %c0, %c0, %c0] - { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> } - : vector<3x3xf32>, memref - return -} - -// CHECK-LABEL: transfer_write_minor_identity( -// CHECK-SAME: %[[A:.*]]: vector<3x3xf32>, -// CHECK-SAME: %[[B:.*]]: memref) -// CHECK-DAG: %[[c0:.*]] = constant 0 : index -// CHECK-DAG: %[[c1:.*]] = constant 1 : index -// CHECK-DAG: %[[c2:.*]] = constant 2 : index -// CHECK-DAG: %[[c3:.*]] = constant 3 : index -// CHECK: %[[m:.*]] = memref.alloca() : memref> -// CHECK: memref.store %[[A]], %[[m]][] : memref> -// CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref> to memref<3xvector<3xf32>> -// CHECK: scf.for %[[arg2:.*]] = %[[c0]] to %[[c3]] -// CHECK: %[[d:.*]] = memref.dim %[[B]], %[[c2]] : memref -// CHECK: %[[cmp:.*]] = cmpi sgt, %[[d]], %[[arg2]] : index -// CHECK: scf.if %[[cmp]] { -// CHECK: %[[tmp:.*]] = memref.load %[[cast]][%[[arg2]]] : memref<3xvector<3xf32>> -// CHECK: vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] : vector<3xf32>, memref -// CHECK: } -// CHECK: } -// CHECK: return - - -// ----- - -func @transfer_read_strided(%A : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) -> vector<4xf32> { - %c0 = constant 0 : index - %f0 = constant 0.0 : f32 - %0 = vector.transfer_read %A[%c0, %c0], %f0 - : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>, vector<4xf32> - return %0 : vector<4xf32> -} - -// CHECK-LABEL: transfer_read_strided( -// CHECK: scf.for -// CHECK: memref.load - -func @transfer_write_strided(%A : vector<4xf32>, %B : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) { - %c0 = constant 0 : index - vector.transfer_write %A, %B[%c0, %c0] : - vector<4xf32>, memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>> - return -} - -// CHECK-LABEL: transfer_write_strided( -// CHECK: scf.for -// CHECK: store - diff --git a/mlir/test/Conversion/VectorToSCF/unrolled-vector-to-loops.mlir b/mlir/test/Conversion/VectorToSCF/unrolled-vector-to-loops.mlir --- a/mlir/test/Conversion/VectorToSCF/unrolled-vector-to-loops.mlir +++ b/mlir/test/Conversion/VectorToSCF/unrolled-vector-to-loops.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -test-unrolled-progressive-convert-vector-to-scf -split-input-file -allow-unregistered-dialect | FileCheck %s +// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -split-input-file -allow-unregistered-dialect | FileCheck %s // CHECK-LABEL: func @transfer_read_inbounds func @transfer_read_inbounds(%A : memref) -> (vector<2x3x4xf32>) { diff --git 
a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir --- a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir +++ b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir @@ -18,10 +18,9 @@ // CHECK: scf.if // CHECK-NEXT: memref.load // CHECK-NEXT: vector.insertelement - // CHECK-NEXT: store + // CHECK-NEXT: scf.yield // CHECK-NEXT: else - // CHECK-NEXT: vector.insertelement - // CHECK-NEXT: store + // CHECK-NEXT: scf.yield // Add a dummy use to prevent dead code elimination from removing transfer // read ops. "dummy_use"(%f1, %f2, %f3, %f4) : (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> () @@ -65,37 +64,40 @@ // CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) { func @materialize_read(%M: index, %N: index, %O: index, %P: index) { %f0 = constant 0.0: f32 - // CHECK-DAG: %[[ALLOC:.*]] = memref.alloca() : memref<5x4xvector<3xf32>> + // CHECK-DAG: %[[ALLOC:.*]] = memref.alloca() : memref> // CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK-DAG: %[[C1:.*]] = constant 1 : index // CHECK-DAG: %[[C3:.*]] = constant 3 : index // CHECK-DAG: %[[C4:.*]] = constant 4 : index // CHECK-DAG: %[[C5:.*]] = constant 5 : index - // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref + // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 { // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} { // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} { // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 { - // CHECK-NEXT: scf.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] { - // CHECK-NEXT: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { - // CHECK-NEXT: scf.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] { - // CHECK: %[[VIDX:.*]] = index_cast %[[I4]] - // CHECK: %[[VEC:.*]] = memref.load %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>> - // CHECK: %[[L0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I4]]) - // CHECK: %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I6]]) - // CHECK-NEXT: scf.if - // CHECK-NEXT: %[[SCAL:.*]] = memref.load %{{.*}}[%[[L0]], %[[I1]], %[[I2]], %[[L3]]] : memref - // CHECK-NEXT: %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %[[VEC]][%[[VIDX]] : i32] : vector<3xf32> - // CHECK-NEXT: store %[[RVEC]], %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>> - // CHECK-NEXT: } else { - // CHECK-NEXT: %[[CVEC:.*]] = vector.insertelement - // CHECK-NEXT: store %[[CVEC]], %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>> + // CHECK: scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] { + // CHECK: scf.if + // CHECK: %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]]) + // CHECK: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { + // CHECK: %[[VEC:.*]] = scf.for %[[I6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<3xf32>) { + // CHECK: %[[L0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I6]]) + // CHECK: %[[VIDX:.*]] = index_cast %[[I6]] + // CHECK: scf.if {{.*}} -> (vector<3xf32>) { + // CHECK-NEXT: %[[SCAL:.*]] = memref.load %{{.*}}[%[[L0]], %[[I1]], %[[I2]], %[[L3]]] : memref + // CHECK-NEXT: %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %{{.*}}[%[[VIDX]] : i32] : vector<3xf32> + // CHECK-NEXT: scf.yield + // CHECK-NEXT: } else { + // CHECK-NEXT: scf.yield + // CHECK-NEXT: } + // CHECK-NEXT: scf.yield // CHECK-NEXT: } + // CHECK-NEXT: memref.store %[[VEC]], {{.*}} : memref<5x4xvector<3xf32>> // CHECK-NEXT: } + // 
CHECK-NEXT: } else { + // CHECK-NEXT: memref.store {{.*}} : memref<5xvector<4x3xf32>> // CHECK-NEXT: } // CHECK-NEXT: } - // CHECK-NEXT: %[[ALLOC_CAST:.*]] = vector.type_cast %[[ALLOC]] : memref<5x4xvector<3xf32>> to memref> - // CHECK-NEXT: %[[LD:.*]] = memref.load %[[ALLOC_CAST]][] : memref> + // CHECK-NEXT: %[[LD:.*]] = memref.load %[[ALLOC]][] : memref> // CHECK-NEXT: "dummy_use"(%[[LD]]) : (vector<5x4x3xf32>) -> () // CHECK-NEXT: } // CHECK-NEXT: } @@ -129,42 +131,46 @@ // CHECK-LABEL:func @materialize_write(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) { func @materialize_write(%M: index, %N: index, %O: index, %P: index) { - // CHECK-DAG: %[[ALLOC:.*]] = memref.alloca() : memref<5x4xvector<3xf32>> + // CHECK-DAG: %[[ALLOC:.*]] = memref.alloca() : memref> // CHECK-DAG: %{{.*}} = constant dense<1.000000e+00> : vector<5x4x3xf32> // CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK-DAG: %[[C1:.*]] = constant 1 : index // CHECK-DAG: %[[C3:.*]] = constant 3 : index // CHECK-DAG: %[[C4:.*]] = constant 4 : index // CHECK-DAG: %[[C5:.*]] = constant 5 : index - // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref - // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 { - // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} step 4 { - // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} { - // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 { - // CHECK-NEXT: %[[VECTOR_VIEW:.*]] = vector.type_cast {{.*}} : memref<5x4xvector<3xf32>> - // CHECK: store %{{.*}}, {{.*}} : memref> - // CHECK-NEXT: scf.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] { - // CHECK-NEXT: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { - // CHECK-NEXT: scf.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] { - // CHECK: %[[VIDX:.*]] = index_cast %[[I4]] - // CHECK: %[[S0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I4]]) - // CHECK: %[[S1:.*]] = affine.apply #[[$ADD]](%[[I1]], %[[I5]]) - // CHECK: %[[S3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I6]]) - // CHECK-NEXT: scf.if - // CHECK-NEXT: %[[VEC:.*]] = memref.load {{.*}}[%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>> - // CHECK-NEXT: %[[SCAL:.*]] = vector.extractelement %[[VEC]][%[[VIDX]] : i32] : vector<3xf32> - // CHECK: store %[[SCAL]], {{.*}}[%[[S0]], %[[S1]], %[[I2]], %[[S3]]] : memref - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: return - // CHECK-NEXT:} - // + // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref + // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 { + // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} step 4 { + // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} { + // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 { + // CHECK: memref.store %{{.*}}, %[[ALLOC]][] : memref> + // CHECK: %[[VECTOR_VIEW1:.*]] = vector.type_cast %[[ALLOC]] : memref> to memref<5xvector<4x3xf32>> + // CHECK: scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] { + // CHECK: scf.if + // CHECK: %[[S3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]]) + // CHECK: %[[VECTOR_VIEW2:.*]] = vector.type_cast %[[VECTOR_VIEW1]] : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>> + // CHECK: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { + // CHECK: scf.if + // CHECK: %[[S1:.*]] = affine.apply #[[$ADD]](%[[I1]], %[[I5]]) + // CHECK: %[[VEC:.*]] = memref.load %[[VECTOR_VIEW2]][%[[I4]], %[[I5]]] : memref<5x4xvector<3xf32>> + // CHECK: scf.for 
%[[I6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] { + // CHECK: %[[S0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I6]]) + // CHECK: %[[VIDX:.*]] = index_cast %[[I6]] + // CHECK: scf.if + // CHECK: %[[SCAL:.*]] = vector.extractelement %[[VEC]][%[[VIDX]] : i32] : vector<3xf32> + // CHECK: memref.store %[[SCAL]], {{.*}}[%[[S0]], %[[S1]], %[[I2]], %[[S3]]] : memref + // CHECK: } + // CHECK: } + // CHECK: } + // CHECK: } + // CHECK: } + // CHECK: } + // CHECK: } + // CHECK: } + // CHECK: } + // CHECK: } + // CHECK: return + // Check that I0 + I4 (of size 3) read from last index load(..., I4) and write into first index store(S0, ...) // Check that I1 + I5 (of size 4) read from second index load(..., I5, ...) and write into second index store(..., S1, ...) // Check that I3 + I6 (of size 5) read from first index load(I6, ...) and write into last index store(..., S3) @@ -203,53 +209,52 @@ %f7 = constant 7.0: f32 // CHECK-DAG: %[[C7:.*]] = constant 7.000000e+00 : f32 // CHECK-DAG: %[[C0:.*]] = constant 0 : index + // CHECK-DAG: %[[C1:.*]] = constant 1 : index + // CHECK-DAG: %[[C3:.*]] = constant 3 : index // CHECK-DAG: %[[splat:.*]] = constant dense<7.000000e+00> : vector<15xf32> - // CHECK-DAG: %[[alloc:.*]] = memref.alloca() : memref<3xvector<15xf32>> - // CHECK-DAG: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref - // CHECK: affine.for %[[I:.*]] = 0 to 3 { - // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] - // CHECK: %[[cond1:.*]] = cmpi slt, %[[add]], %[[dim]] : index - // CHECK: scf.if %[[cond1]] { - // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%[[add]], %[[base]]], %[[C7]] : memref, vector<15xf32> - // CHECK: store %[[vec_1d]], %[[alloc]][%[[I]]] : memref<3xvector<15xf32>> - // CHECK: } else { - // CHECK: store %[[splat]], %[[alloc]][%[[I]]] : memref<3xvector<15xf32>> - // CHECK: } - // CHECK: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<3xvector<15xf32>> to memref> - // CHECK: %[[cst:.*]] = memref.load %[[vmemref]][] : memref> + // CHECK-DAG: %[[alloc:.*]] = memref.alloca() : memref> + // CHECK: %[[alloc_casted:.*]] = vector.type_cast %[[alloc]] : memref> to memref<3xvector<15xf32>> + // CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C3]] + // CHECK: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref + // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] + // CHECK: %[[cond1:.*]] = cmpi sgt, %[[dim]], %[[add]] : index + // CHECK: scf.if %[[cond1]] { + // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref, vector<15xf32> + // CHECK: memref.store %[[vec_1d]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>> + // CHECK: } else { + // CHECK: store %[[splat]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>> + // CHECK: } + // CHECK: } + // CHECK: %[[cst:.*]] = memref.load %[[alloc]][] : memref> // FULL-UNROLL: %[[C7:.*]] = constant 7.000000e+00 : f32 // FULL-UNROLL: %[[VEC0:.*]] = constant dense<7.000000e+00> : vector<3x15xf32> // FULL-UNROLL: %[[C0:.*]] = constant 0 : index - // FULL-UNROLL: %[[SPLAT:.*]] = constant dense<7.000000e+00> : vector<15xf32> // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref - // FULL-UNROLL: cmpi slt, %[[base]], %[[DIM]] : index + // FULL-UNROLL: cmpi sgt, %[[DIM]], %[[base]] : index // FULL-UNROLL: %[[VEC1:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[C7]] : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: 
scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { - // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } // FULL-UNROLL: affine.apply #[[$MAP1]]()[%[[base]]] - // FULL-UNROLL: cmpi slt, %{{.*}}, %[[DIM]] : index + // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index // FULL-UNROLL: %[[VEC2:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { - // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } // FULL-UNROLL: affine.apply #[[$MAP2]]()[%[[base]]] - // FULL-UNROLL: cmpi slt, %{{.*}}, %[[DIM]] : index + // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index // FULL-UNROLL: %[[VEC3:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { - // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } @@ -275,37 +280,40 @@ // FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index, // FULL-UNROLL-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32> func @transfer_write_progressive(%A : memref, %base: index, %vec: vector<3x15xf32>) { - // CHECK: %[[C0:.*]] = constant 0 : index - // CHECK: %[[alloc:.*]] = memref.alloca() : memref<3xvector<15xf32>> - // CHECK: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<3xvector<15xf32>> to memref> - // CHECK: store %[[vec]], %[[vmemref]][] : memref> - // CHECK: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref - // CHECK: affine.for %[[I:.*]] = 0 to 3 { - // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] - // CHECK: %[[cmp:.*]] = cmpi slt, %[[add]], %[[dim]] : index - // CHECK: scf.if %[[cmp]] { - // CHECK: %[[vec_1d:.*]] = memref.load %0[%[[I]]] : memref<3xvector<15xf32>> - // CHECK: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] : vector<15xf32>, memref - // CHECK: } + // CHECK-DAG: %[[C0:.*]] = constant 0 : index + // CHECK-DAG: %[[C1:.*]] = constant 1 : index + // CHECK-DAG: %[[C3:.*]] = constant 3 : index + // CHECK: %[[alloc:.*]] = memref.alloca() : memref> + // CHECK: memref.store %[[vec]], %[[alloc]][] : memref> + // CHECK: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref> to memref<3xvector<15xf32>> + // CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C3]] + // CHECK: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref + // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] + // CHECK: %[[cmp:.*]] = cmpi sgt, %[[dim]], %[[add]] : index + // CHECK: scf.if %[[cmp]] { + // CHECK: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>> + // CHECK: vector.transfer_write %[[vec_1d]], %[[A]][{{.*}}, %[[base]]] : vector<15xf32>, memref + // CHECK: } + // CHECK: } // FULL-UNROLL: %[[C0:.*]] = constant 0 : index // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref - // FULL-UNROLL: %[[CMP0:.*]] = cmpi slt, %[[base]], %[[DIM]] : index + // FULL-UNROLL: 
%[[CMP0:.*]] = cmpi sgt, %[[DIM]], %[[base]] : index // FULL-UNROLL: scf.if %[[CMP0]] { // FULL-UNROLL: %[[V0:.*]] = vector.extract %[[vec]][0] : vector<3x15xf32> // FULL-UNROLL: vector.transfer_write %[[V0]], %[[A]][%[[base]], %[[base]]] : vector<15xf32>, memref // FULL-UNROLL: } // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]] - // FULL-UNROLL: %[[CMP1:.*]] = cmpi slt, %[[I1]], %[[DIM]] : index + // FULL-UNROLL: %[[CMP1:.*]] = cmpi sgt, %{{.*}}, %[[I1]] : index // FULL-UNROLL: scf.if %[[CMP1]] { // FULL-UNROLL: %[[V1:.*]] = vector.extract %[[vec]][1] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[V1]], %[[A]][%[[I1]], %[[base]]] : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %[[V1]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref // FULL-UNROLL: } // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]] - // FULL-UNROLL: %[[CMP2:.*]] = cmpi slt, %[[I2]], %[[DIM]] : index + // FULL-UNROLL: %[[CMP2:.*]] = cmpi sgt, %{{.*}}, %[[I2]] : index // FULL-UNROLL: scf.if %[[CMP2]] { // FULL-UNROLL: %[[V2:.*]] = vector.extract %[[vec]][2] : vector<3x15xf32> - // FULL-UNROLL: vector.transfer_write %[[V2]], %[[A]][%[[I2]], %[[base]]] : vector<15xf32>, memref + // FULL-UNROLL: vector.transfer_write %[[V2]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref // FULL-UNROLL: } vector.transfer_write %vec, %A[%base, %base] : @@ -330,12 +338,14 @@ // FULL-UNROLL-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32> func @transfer_write_progressive_inbounds(%A : memref, %base: index, %vec: vector<3x15xf32>) { // CHECK-NOT: scf.if - // CHECK-NEXT: %[[alloc:.*]] = memref.alloca() : memref<3xvector<15xf32>> - // CHECK-NEXT: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<3xvector<15xf32>> to memref> - // CHECK-NEXT: store %[[vec]], %[[vmemref]][] : memref> - // CHECK-NEXT: affine.for %[[I:.*]] = 0 to 3 { + // CHECK-DAG: %[[C0:.*]] = constant 0 : index + // CHECK-DAG: %[[C3:.*]] = constant 3 : index + // CHECK: %[[alloc:.*]] = memref.alloca() : memref> + // CHECK-NEXT: memref.store %[[vec]], %[[alloc]][] : memref> + // CHECK-NEXT: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref> to memref<3xvector<15xf32>> + // CHECK-NEXT: scf.for %[[I:.*]] = %[[C0]] to %[[C3]] // CHECK-NEXT: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] - // CHECK-NEXT: %[[vec_1d:.*]] = memref.load %0[%[[I]]] : memref<3xvector<15xf32>> + // CHECK-NEXT: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>> // CHECK-NEXT: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref // FULL-UNROLL: %[[VEC0:.*]] = vector.extract %[[vec]][0] : vector<3x15xf32> @@ -378,25 +388,27 @@ } // CHECK-LABEL: transfer_read_minor_identity( -// CHECK-SAME: %[[A:.*]]: memref) -> vector<3x3xf32> -// CHECK-DAG: %[[c0:.*]] = constant 0 : index -// CHECK-DAG: %[[f0:.*]] = constant 0.000000e+00 : f32 -// CHECK-DAG: %[[c2:.*]] = constant 2 : index -// CHECK-DAG: %[[cst0:.*]] = constant dense<0.000000e+00> : vector<3xf32> -// CHECK: %[[m:.*]] = memref.alloca() : memref<3xvector<3xf32>> -// CHECK: %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref -// CHECK: affine.for %[[arg1:.*]] = 0 to 3 { -// CHECK: %[[cmp:.*]] = cmpi slt, %[[arg1]], %[[d]] : index -// CHECK: scf.if %[[cmp]] { -// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] : memref, vector<3xf32> -// CHECK: store %[[tr]], %[[m]][%[[arg1]]] : memref<3xvector<3xf32>> -// CHECK: } else { -// CHECK: store %[[cst0]], %[[m]][%[[arg1]]] 
: memref<3xvector<3xf32>> -// CHECK: } -// CHECK: } -// CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref<3xvector<3xf32>> to memref> -// CHECK: %[[ret:.*]] = memref.load %[[cast]][] : memref> -// CHECK: return %[[ret]] : vector<3x3xf32> +// CHECK-SAME: %[[A:.*]]: memref) -> vector<3x3xf32> +// CHECK-DAG: %[[c0:.*]] = constant 0 : index +// CHECK-DAG: %[[c1:.*]] = constant 1 : index +// CHECK-DAG: %[[c2:.*]] = constant 2 : index +// CHECK-DAG: %[[c3:.*]] = constant 3 : index +// CHECK-DAG: %[[f0:.*]] = constant 0.000000e+00 : f32 +// CHECK-DAG: %[[cst0:.*]] = constant dense<0.000000e+00> : vector<3xf32> +// CHECK: %[[m:.*]] = memref.alloca() : memref> +// CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref> to memref<3xvector<3xf32>> +// CHECK: scf.for %[[arg1:.*]] = %[[c0]] to %[[c3]] +// CHECK: %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref +// CHECK: %[[cmp:.*]] = cmpi sgt, %[[d]], %[[arg1]] : index +// CHECK: scf.if %[[cmp]] { +// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] : memref, vector<3xf32> +// CHECK: memref.store %[[tr]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>> +// CHECK: } else { +// CHECK: memref.store %[[cst0]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>> +// CHECK: } +// CHECK: } +// CHECK: %[[ret:.*]] = memref.load %[[m]][] : memref> +// CHECK: return %[[ret]] : vector<3x3xf32> func @transfer_write_minor_identity(%A : vector<3x3xf32>, %B : memref) { %c0 = constant 0 : index @@ -408,22 +420,25 @@ } // CHECK-LABEL: transfer_write_minor_identity( -// CHECK-SAME: %[[A:.*]]: vector<3x3xf32>, -// CHECK-SAME: %[[B:.*]]: memref) -// CHECK-DAG: %[[c2:.*]] = constant 2 : index -// CHECK-DAG: %[[c0:.*]] = constant 0 : index -// CHECK: %[[m:.*]] = memref.alloca() : memref<3xvector<3xf32>> -// CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref<3xvector<3xf32>> to memref> -// CHECK: store %[[A]], %[[cast]][] : memref> -// CHECK: %[[d:.*]] = memref.dim %[[B]], %[[c2]] : memref -// CHECK: affine.for %[[arg2:.*]] = 0 to 3 { -// CHECK: %[[cmp:.*]] = cmpi slt, %[[arg2]], %[[d]] : index -// CHECK: scf.if %[[cmp]] { -// CHECK: %[[tmp:.*]] = memref.load %[[m]][%[[arg2]]] : memref<3xvector<3xf32>> -// CHECK: vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] : vector<3xf32>, memref -// CHECK: } -// CHECK: } -// CHECK: return +// CHECK-SAME: %[[A:.*]]: vector<3x3xf32>, +// CHECK-SAME: %[[B:.*]]: memref) +// CHECK-DAG: %[[c0:.*]] = constant 0 : index +// CHECK-DAG: %[[c1:.*]] = constant 1 : index +// CHECK-DAG: %[[c2:.*]] = constant 2 : index +// CHECK-DAG: %[[c3:.*]] = constant 3 : index +// CHECK: %[[m:.*]] = memref.alloca() : memref> +// CHECK: memref.store %[[A]], %[[m]][] : memref> +// CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref> to memref<3xvector<3xf32>> +// CHECK: scf.for %[[arg2:.*]] = %[[c0]] to %[[c3]] +// CHECK: %[[d:.*]] = memref.dim %[[B]], %[[c2]] : memref +// CHECK: %[[cmp:.*]] = cmpi sgt, %[[d]], %[[arg2]] : index +// CHECK: scf.if %[[cmp]] { +// CHECK: %[[tmp:.*]] = memref.load %[[cast]][%[[arg2]]] : memref<3xvector<3xf32>> +// CHECK: vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] : vector<3xf32>, memref +// CHECK: } +// CHECK: } +// CHECK: return + // ----- diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir --- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-1d.mlir 
@@ -1,9 +1,9 @@ -// RUN: mlir-opt %s -test-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ // RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s -// RUN: mlir-opt %s -test-unrolled-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ // RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir --- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-2d.mlir @@ -1,9 +1,9 @@ -// RUN: mlir-opt %s -test-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ // RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s -// RUN: mlir-opt %s -test-unrolled-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ // RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir --- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read-3d.mlir @@ -1,10 +1,10 @@ -// RUN: mlir-opt %s -test-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ // RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s -// RUN: mlir-opt %s -test-unrolled-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read.mlir --- 
a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-read.mlir
@@ -1,4 +1,9 @@
-// RUN: mlir-opt %s -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
+// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-transfer-to-loops.mlir
@@ -3,7 +3,7 @@
 // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext,%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
 
-// RUN: mlir-opt %s -test-progressive-convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+// RUN: mlir-opt %s -convert-vector-to-scf=full-unroll=true -lower-affine -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext,%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
diff --git a/mlir/test/lib/Transforms/TestVectorTransforms.cpp b/mlir/test/lib/Transforms/TestVectorTransforms.cpp
--- a/mlir/test/lib/Transforms/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestVectorTransforms.cpp
@@ -9,7 +9,6 @@
 #include <type_traits>
 
 #include "mlir/Analysis/SliceAnalysis.h"
-#include "mlir/Conversion/VectorToSCF/ProgressiveVectorToSCF.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -390,23 +389,6 @@
   }
 };
 
-template <bool Unroll>
-struct TestProgressiveVectorToSCFLoweringPatterns
-    : public PassWrapper<TestProgressiveVectorToSCFLoweringPatterns<Unroll>,
-                         FunctionPass> {
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<AffineDialect, memref::MemRefDialect, scf::SCFDialect>();
-  }
-  void runOnFunction() override {
-    RewritePatternSet patterns(&this->getContext());
-    ProgressiveVectorTransferToSCFOptions options;
-    options.unroll = Unroll;
-    populateProgressiveVectorToSCFConversionPatterns(patterns, options);
-    (void)applyPatternsAndFoldGreedily(this->getFunction(),
-                                       std::move(patterns));
-  }
-};
-
 } // end anonymous namespace
 
 namespace mlir {
@@ -454,19 +436,6 @@
       "test-vector-transfer-lowering-patterns",
       "Test conversion patterns to lower transfer ops to other vector ops");
 
-  PassRegistration<TestProgressiveVectorToSCFLoweringPatterns<false>>
-      transferOpToSCF("test-progressive-convert-vector-to-scf",
-                      "Test conversion patterns to progressively lower "
-                      "transfer ops to SCF");
-
-  PassRegistration<TestProgressiveVectorToSCFLoweringPatterns<true>>
-      transferOpToSCFUnrolled(
-          "test-unrolled-progressive-convert-vector-to-scf",
-          "Test conversion patterns to progressively lower transfer ops to SCF"
-          "(unrolled variant)");
-
   PassRegistration<TestVectorMultiReductionLoweringPatterns>
       multiDimReductionOpLoweringPass(
           "test-vector-multi-reduction-lowering-patterns",