Diff 339837

mlir/include/mlir/Analysis/Utils.h

	Show First 20 Lines • Show All 348 Lines • ▼ Show 20 Lines
	/// Returns the number of surrounding loops common to both A and B.			/// Returns the number of surrounding loops common to both A and B.
	unsigned getNumCommonSurroundingLoops(Operation &A, Operation &B);			unsigned getNumCommonSurroundingLoops(Operation &A, Operation &B);

	/// Gets the memory footprint of all data touched in the specified memory space			/// Gets the memory footprint of all data touched in the specified memory space
	/// in bytes; if the memory space is unspecified, considers all memory spaces.			/// in bytes; if the memory space is unspecified, considers all memory spaces.
	Optional<int64_t> getMemoryFootprintBytes(AffineForOp forOp,			Optional<int64_t> getMemoryFootprintBytes(AffineForOp forOp,
	int memorySpace = -1);			int memorySpace = -1);

	/// Returns true if `forOp' is a parallel loop.			/// Returns true if `forOp' is a parallel loop. By default loops with
	bool isLoopParallel(AffineForOp forOp);			/// loop-carried variables (iter_args) are considered non-parallel, unless
				/// `ignoreIterArgs = true` is provided, in which case only memory operations
				/// are checked. See also `mlir::isParallelReductionLoop`.
				bondhugulaUnsubmitted Not Done Reply Inline Actions Making the method return true with the `ignoreIterArgs` set to true looks hacky. That's an incorrect return result since the loop isn't actually parallel. I'd recommend making the change suggested in your comment below - since it's in the right direction to start with. bondhugula: Making the method return true with the `ignoreIterArgs` set to true looks hacky. That's an…
				sgrechanikAuthorUnsubmitted Not Done Reply Inline Actions Ok, I'll split it further then, so that we'll have three functions: isLoopMemoryParallel (better name suggestions are welcome) that will check only memory dependences isLoopParallel that will work as the old version of isLoopParallel, i.e. loops with iter_args will be considered non-parallel. isParallelReductionLoop that will check both memory dependences and try to recognize reduction loops. sgrechanik: Ok, I'll split it further then, so that we'll have three functions: - isLoopMemoryParallel…
				bool isLoopParallel(AffineForOp forOp, bool ignoreIterArgs = false);

	/// Simplify the integer set by simplifying the underlying affine expressions by			/// Simplify the integer set by simplifying the underlying affine expressions by
				sgrechanikAuthorUnsubmitted Done Reply Inline Actions After talking to my colleagues I decided that I overcomplicated this function. I'll split it in the next patch update into two functions: isLoopParallel and isParallelReductionLoop. sgrechanik: After talking to my colleagues I decided that I overcomplicated this function. I'll split it in…
	/// flattening and some simple inference. Also, drop any duplicate constraints.			/// flattening and some simple inference. Also, drop any duplicate constraints.
	/// Returns the simplified integer set. This method runs in time linear in the			/// Returns the simplified integer set. This method runs in time linear in the
	/// number of constraints.			/// number of constraints.
	IntegerSet simplifyIntegerSet(IntegerSet set);			IntegerSet simplifyIntegerSet(IntegerSet set);

	/// Returns the innermost common loop depth for the set of operations in 'ops'.			/// Returns the innermost common loop depth for the set of operations in 'ops'.
	unsigned getInnermostCommonLoopDepth(			unsigned getInnermostCommonLoopDepth(
	ArrayRef<Operation *> ops,			ArrayRef<Operation *> ops,
	SmallVectorImpl<AffineForOp> *surroundingLoops = nullptr);			SmallVectorImpl<AffineForOp> *surroundingLoops = nullptr);

	} // end namespace mlir			} // end namespace mlir

	#endif // MLIR_ANALYSIS_UTILS_H			#endif // MLIR_ANALYSIS_UTILS_H

mlir/include/mlir/Dialect/Affine/Passes.td

Show First 20 Lines • Show All 106 Lines • ▼ Show 20 Lines	let options = [
// the index represents the loop depth, the value represents the k^th		// the index represents the loop depth, the value represents the k^th
// fastest varying memory dimension.		// fastest varying memory dimension.
// This is voluntarily restrictive and is meant to precisely target a		// This is voluntarily restrictive and is meant to precisely target a
// particular loop/op pair, for testing purposes.		// particular loop/op pair, for testing purposes.
ListOption<"fastestVaryingPattern", "test-fastest-varying", "int64_t",		ListOption<"fastestVaryingPattern", "test-fastest-varying", "int64_t",
"Specify a 1-D, 2-D or 3-D pattern of fastest varying memory "		"Specify a 1-D, 2-D or 3-D pattern of fastest varying memory "
"dimensions to match. See defaultPatterns in Vectorize.cpp for "		"dimensions to match. See defaultPatterns in Vectorize.cpp for "
"a description and examples. This is used for testing purposes",		"a description and examples. This is used for testing purposes",
"llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">		"llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
		Option<"vectorizeReductions", "vectorize-reductions", "bool",
		/*default=*/"false",
		"Vectorize known reductions expressed via iter_args. "
		"Switched off by default.">
];		];
}		}

def AffineParallelize : FunctionPass<"affine-parallelize"> {		def AffineParallelize : FunctionPass<"affine-parallelize"> {
let summary = "Convert affine.for ops into 1-D affine.parallel";		let summary = "Convert affine.for ops into 1-D affine.parallel";
let constructor = "mlir::createAffineParallelizePass()";		let constructor = "mlir::createAffineParallelizePass()";
let options = [		let options = [
Option<"maxNested", "max-nested", "unsigned", /default=/"-1u",		Option<"maxNested", "max-nested", "unsigned", /default=/"-1u",
Show All 17 Lines

mlir/include/mlir/Dialect/Affine/Utils.h

	//===- Utils.h - Affine dialect utilities ------------------------ C++ --===//			//===- Utils.h - Affine dialect utilities ------------------------ C++ --===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// This header file declares a set of utilities for the affine dialect ops.			// This header file declares a set of utilities for the affine dialect ops.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef MLIR_DIALECT_AFFINE_UTILS_H			#ifndef MLIR_DIALECT_AFFINE_UTILS_H
	#define MLIR_DIALECT_AFFINE_UTILS_H			#define MLIR_DIALECT_AFFINE_UTILS_H

				#include "mlir/Dialect/StandardOps/IR/Ops.h"
	#include "mlir/IR/AffineExpr.h"			#include "mlir/IR/AffineExpr.h"
				#include "mlir/IR/Builders.h"
				#include "mlir/IR/Value.h"
	#include "mlir/Support/LLVM.h"			#include "mlir/Support/LLVM.h"
	#include "llvm/ADT/DenseMap.h"			#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/SmallVector.h"			#include "llvm/ADT/SmallVector.h"
				bondhugulaUnsubmitted Not Done Reply Inline Actions Prune includes please. bondhugula: Prune includes please.

	namespace mlir {			namespace mlir {

	class AffineForOp;			class AffineForOp;
	class AffineIfOp;			class AffineIfOp;
	class AffineParallelOp;			class AffineParallelOp;
	struct LogicalResult;			struct LogicalResult;
	class Operation;			class Operation;
				class ReductionRecognizer;

	/// Replaces parallel affine.for op with 1-d affine.parallel op.			/// Replaces parallel affine.for op with 1-d affine.parallel op.
	/// mlir::isLoopParallel detect the parallel affine.for ops.			/// mlir::isLoopParallel detect the parallel affine.for ops.
	/// There is no cost model currently used to drive this parallelization.			/// There is no cost model currently used to drive this parallelization.
	void affineParallelize(AffineForOp forOp);			void affineParallelize(AffineForOp forOp);

	/// Hoists out affine.if/else to as high as possible, i.e., past all invariant			/// Hoists out affine.if/else to as high as possible, i.e., past all invariant
	/// affine.fors/parallel's. Returns success if any hoisting happened; folded` is			/// affine.fors/parallel's. Returns success if any hoisting happened; folded` is
	Show All 35 Lines
	struct VectorizationStrategy {			struct VectorizationStrategy {
	// Vectorization factors to apply to each target vector dimension.			// Vectorization factors to apply to each target vector dimension.
	// Each factor will be applied to a different loop.			// Each factor will be applied to a different loop.
	SmallVector<int64_t, 8> vectorSizes;			SmallVector<int64_t, 8> vectorSizes;
	// Maps each AffineForOp vectorization candidate with its vector dimension.			// Maps each AffineForOp vectorization candidate with its vector dimension.
	// The candidate will be vectorized using the vectorization factor in			// The candidate will be vectorized using the vectorization factor in
	// 'vectorSizes' for that dimension.			// 'vectorSizes' for that dimension.
	DenseMap<Operation *, unsigned> loopToVectorDim;			DenseMap<Operation *, unsigned> loopToVectorDim;
				// An optional reduction recognizer that will be used to recognize reduction
				// loops vectorizable along the reduction dimension.
				const ReductionRecognizer *reductionRecognizer = nullptr;
	};			};

	/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in			/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
	/// 'vectorSizes'. By default, each vectorization factor is applied			/// 'vectorSizes'. By default, each vectorization factor is applied
	/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can			/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
	/// be optionally used to provide a different loop vectorization order.			/// be optionally used to provide a different loop vectorization order.
				/// If `reductionRecognizer` is not null, recognized reduction loops may be
				/// vectorized along the reduction dimension.
				/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
	void vectorizeAffineLoops(			void vectorizeAffineLoops(
	Operation *parentOp,			Operation *parentOp,
	llvm::DenseSet<Operation , DenseMapInfo<Operation >> &loops,			llvm::DenseSet<Operation , DenseMapInfo<Operation >> &loops,
	ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern);			ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern,
				const ReductionRecognizer *reductionRecognizer = nullptr);

	/// External utility to vectorize affine loops from a single loop nest using an			/// External utility to vectorize affine loops from a single loop nest using an
	/// n-D vectorization strategy (see doc in VectorizationStrategy definition).			/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
	/// Loops are provided in a 2D vector container. The first dimension represents			/// Loops are provided in a 2D vector container. The first dimension represents
	/// the nesting level relative to the loops to be vectorized. The second			/// the nesting level relative to the loops to be vectorized. The second
	/// dimension contains the loops. This means that:			/// dimension contains the loops. This means that:
	/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',			/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
	/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.			/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
	▲ Show 20 Lines • Show All 49 Lines • Show Last 20 Lines

mlir/include/mlir/Transforms/ReductionUtils.h

This file was added.

				//===- ReductionUtils.h - Reduction-related utilities ----------*- C++ -*-===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// This header file declares utilities for recognizing and manipulating
				// reductions.
				//
				//===----------------------------------------------------------------------===//

				#ifndef MLIR_TRANSFORMS_REDUCTION_UTILS_H
				#define MLIR_TRANSFORMS_REDUCTION_UTILS_H

				#include "mlir/Dialect/Affine/IR/AffineOps.h"
				#include "mlir/IR/Value.h"
				#include "mlir/Support/LLVM.h"

				namespace mlir {

				class OpBuilder;

				/// This abstract class is a collection of utilities needed for transforming
				/// (vectorizing etc) a specific kind of reduction (like sum, product, etc).
				/// Every utility is a pure virtual member function, so each concrete reduction
				/// kind has to implement all of them.
				/// TODO: Support more utilities like generating `affine.parallel` and
				/// `atomic_rmw`.
				class ReductionInfo {
				public:
				/// Creates an operation that takes the vector value `vector` and reduces it
				/// into a scalar, for example:
				/// %res = vector.reduction "add", %vector : vector<128xf32> into f32
				/// `builder` is used to create the operation; the returned value is the
				/// result of the reduction.
				virtual Value createVectorReduction(Value vector,
				OpBuilder &builder) const = 0;

				/// Creates an operation that combines scalar values `lhs` and `rhs`, e.g.:
				/// %res = addf %lhs, %rhs : f32
				/// `builder` is used to create the operation; the returned value is the
				/// combined result.
				virtual Value combine(Value lhs, Value rhs, OpBuilder &builder) const = 0;

				/// Creates an attribute of type `elemType` representing the neutral element
				/// of this reduction.
				/// NOTE(review): presumably this is 0 for sums and 1 for products; confirm
				/// against the concrete implementations.
				virtual Attribute getNeutralElementAttr(Type elemType,
				OpBuilder &builder) const = 0;

				/// Virtual so that concrete implementations can be destroyed through a
				/// `ReductionInfo` pointer.
				virtual ~ReductionInfo() = default;
				};

				/// The base class for reduction recognizers that check if the given
				/// loop-carried variable represents a known reduction kind.
				class ReductionRecognizer {
				public:
				/// Checks if the loop-carried variable represented by the argument passed to
				/// the current iteration `arg` and the value passed to the next iteration
				/// `yielded` computes a known reduction. Returns an instance of
				/// `ReductionInfo` for this reduction kind if the reduction is recognized and
				/// `nullptr` if it's not.
				/// NOTE(review): ownership of the returned pointer is not stated in this
				/// header; confirm whether callers may rely on it outliving the recognizer.
				virtual const ReductionInfo *recognize(BlockArgument arg,
				Value yielded) const = 0;
				/// Virtual so that concrete recognizers can be destroyed through a
				/// `ReductionRecognizer` pointer.
				virtual ~ReductionRecognizer() = default;
				};

				/// A reduction recognizer that never matches: every loop-carried variable
				/// is reported as not being a known reduction.
				class NullReductionRecognizer : public ReductionRecognizer {
				public:
				  /// Ignores both arguments and always returns `nullptr`.
				  const ReductionInfo *recognize(BlockArgument /*arg*/,
				                                 Value /*yielded*/) const override {
				    return nullptr;
				  }
				};

				/// `ReductionInfo` implementation for standard reductions whose combiner is
				/// the single operation `Op`.
				template <typename Op>
				class StandardReductionInfo : public ReductionInfo {
				public:
				  /// The string used as the kind for `vector.reduction`.
				  static const char *const kindString;

				  /// Returns an instance of this class.
				  static ReductionInfo *get();

				  /// Reduces `vector` to a scalar (see `ReductionInfo`).
				  Value createVectorReduction(Value vector,
				                              OpBuilder &builder) const override;

				  /// Combines the scalars `lhs` and `rhs` (see `ReductionInfo`).
				  Value combine(Value lhs, Value rhs, OpBuilder &builder) const override;

				  /// Builds the neutral element of type `elemType` (see `ReductionInfo`).
				  Attribute getNeutralElementAttr(Type elemType,
				                                  OpBuilder &builder) const override;
				};

				/// Recognizer for the standard parallelizable reductions.
				/// Currently supports addf, mulf, addi, muli.
				/// TODO: Support max and min.
				class StandardReductionRecognizer : public ReductionRecognizer {
				public:
				  /// See `ReductionRecognizer::recognize`.
				  const ReductionInfo *recognize(BlockArgument arg,
				                                 Value yielded) const override;

				protected:
				  /// NOTE(review): presumably returns the one operation that combines `arg`
				  /// into `yielded`, or null when there is no such single operation; confirm
				  /// against the definition, which is not part of this header.
				  Operation *getSingleOpCombiner(BlockArgument arg, Value yielded) const;

				  /// Overridable hook for reductions expressed with a single combiner
				  /// operation. NOTE(review): the exact contract is defined out of line;
				  /// confirm before overriding.
				  virtual const ReductionInfo *recognizeSingleOpCombiner(BlockArgument arg,
				                                                         Value yielded) const;
				};

				/// Returns true if `forOp` is a parallel loop possibly implementing known
				/// reductions via loop-carried variables (iter_args). Reductions are considered
				/// known (and parallel) if they are recognized by `reductionRecognizer`.
				/// NOTE(review): presumably memory dependences are checked the same way as in
				/// `mlir::isLoopParallel`; confirm against the definition.
				bool isParallelReductionLoop(AffineForOp forOp,
				const ReductionRecognizer &reductionRecognizer);

				/// Populates `reductions` with the information about known reductions
				/// implemented by `forOp`. Reductions are considered known if they are
				/// recognized by `reductionRecognizer`. Returns `true` if all iteration
				/// variables implement recognizable reductions and `false` otherwise.
				/// NOTE(review): the contents of `reductions` when `false` is returned are not
				/// specified here; confirm before relying on partial results.
				bool getKnownReductions(AffineForOp forOp,
				const ReductionRecognizer &reductionRecognizer,
				SmallVectorImpl<const ReductionInfo *> &reductions);

				} // end namespace mlir

				#endif // MLIR_TRANSFORMS_REDUCTION_UTILS_H

mlir/lib/Analysis/Utils.cpp

Show First 20 Lines • Show All 1,262 Lines • ▼ Show 20 Lines	void mlir::getSequentialLoops(AffineForOp forOp,
llvm::SmallDenseSet<Value, 8> *sequentialLoops) {		llvm::SmallDenseSet<Value, 8> *sequentialLoops) {
forOp->walk([&](Operation *op) {		forOp->walk([&](Operation *op) {
if (auto innerFor = dyn_cast<AffineForOp>(op))		if (auto innerFor = dyn_cast<AffineForOp>(op))
if (!isLoopParallel(innerFor))		if (!isLoopParallel(innerFor))
sequentialLoops->insert(innerFor.getInductionVar());		sequentialLoops->insert(innerFor.getInductionVar());
});		});
}		}

/// Returns true if 'forOp' is parallel.		/// Returns true if `forOp' is a parallel loop. By default loops with
bool mlir::isLoopParallel(AffineForOp forOp) {		/// loop-carried variables (iter_args) are considered non-parallel, unless
// Loop is not parallel if it has SSA loop-carried dependences.		/// `ignoreIterArgs = true` is provided, in which case only memory operations
// TODO: Conditionally support reductions and other loop-carried dependences		/// are checked. See also `mlir::isParallelReductionLoop`.
// that could be handled in the context of a parallel loop.		bool mlir::isLoopParallel(AffineForOp forOp, bool ignoreIterArgs) {
if (forOp.getNumIterOperands() > 0)		// Loop is not parallel if it has SSA loop-carried dependences (unless it is
		// explicitly requested to ignore them).
		if (!ignoreIterArgs && forOp.getNumIterOperands() > 0)
return false;		return false;

// Collect all load and store ops in loop nest rooted at 'forOp'.		// Collect all load and store ops in loop nest rooted at 'forOp'.
SmallVector<Operation *, 8> loadAndStoreOpInsts;		SmallVector<Operation *, 8> loadAndStoreOpInsts;
auto walkResult = forOp.walk([&](Operation *opInst) -> WalkResult {		auto walkResult = forOp.walk([&](Operation *opInst) -> WalkResult {
if (isa<AffineReadOpInterface, AffineWriteOpInterface>(opInst))		if (isa<AffineReadOpInterface, AffineWriteOpInterface>(opInst))
loadAndStoreOpInsts.push_back(opInst);		loadAndStoreOpInsts.push_back(opInst);
else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(opInst) &&		else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(opInst) &&
Show All 40 Lines

mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp

Show All 15 Lines
#include "mlir/Analysis/NestedMatcher.h"		#include "mlir/Analysis/NestedMatcher.h"
#include "mlir/Analysis/Utils.h"		#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"		#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"		#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Vector/VectorOps.h"		#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"		#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/BlockAndValueMapping.h"		#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/Support/LLVM.h"		#include "mlir/Support/LLVM.h"
		#include "mlir/Transforms/ReductionUtils.h"
		#include "llvm/ADT/STLExtras.h"

using namespace mlir;		using namespace mlir;
using namespace vector;		using namespace vector;

///		///
/// Implements a high-level vectorization strategy on a Function.		/// Implements a high-level vectorization strategy on a Function.
/// The abstraction used is that of super-vectors, which provide a single,		/// The abstraction used is that of super-vectors, which provide a single,
/// compact, representation in the vector types, information that is expected		/// compact, representation in the vector types, information that is expected
▲ Show 20 Lines • Show All 286 Lines • ▼ Show 20 Lines
/// affine.for %i = %M to %N step 128 {		/// affine.for %i = %M to %N step 128 {
/// %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>		/// %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>
/// }		/// }
/// ```		/// ```
///		///
/// Unsupported cases, extensions, and work in progress (help welcome :-) ):		/// Unsupported cases, extensions, and work in progress (help welcome :-) ):
/// ========================================================================		/// ========================================================================
/// 1. lowering to concrete vector types for various HW;		/// 1. lowering to concrete vector types for various HW;
/// 2. reduction support;		/// 2. reduction support for n-D vectorization and non-unit steps;
/// 3. non-effecting padding during vector.transfer_read and filter during		/// 3. non-effecting padding during vector.transfer_read and filter during
/// vector.transfer_write;		/// vector.transfer_write;
/// 4. misalignment support vector.transfer_read / vector.transfer_write		/// 4. misalignment support vector.transfer_read / vector.transfer_write
/// (hopefully without read-modify-writes);		/// (hopefully without read-modify-writes);
/// 5. control-flow support;		/// 5. control-flow support;
/// 6. cost-models, heuristics and search;		/// 6. cost-models, heuristics and search;
/// 7. Op implementation, extensions and implication on memref views;		/// 7. Op implementation, extensions and implication on memref views;
/// 8. many TODOs left around.		/// 8. many TODOs left around.
▲ Show 20 Lines • Show All 146 Lines • ▼ Show 20 Lines
/// %c42 = constant 42 : index		/// %c42 = constant 42 : index
/// %9 = load %2[%c7, %c42] : memref<?x?xf32>		/// %9 = load %2[%c7, %c42] : memref<?x?xf32>
/// return %9 : f32		/// return %9 : f32
/// }		/// }
/// ```		/// ```
///		///
/// Of course, much more intricate n-D imperfectly-nested patterns can be		/// Of course, much more intricate n-D imperfectly-nested patterns can be
/// vectorized too and specified in a fully declarative fashion.		/// vectorized too and specified in a fully declarative fashion.
		///
		/// Reduction:
		/// ==========
		/// Vectorizing reduction loops along the reduction dimension is supported if:
		/// - the reduction is recognizable (see `ReductionRecognizer`),
		/// - the vectorization is 1-D, and
		/// - the step size of the loop is equal to one.
		///
		/// Compared to the non-vector-dimension case, two additional things are done
		/// during vectorization of such loops:
		/// - The resulting vector returned from the loop is reduced to a scalar using
		/// `vector.reduction`.
		/// - In some cases a mask is applied to the vector yielded at the end of the
		/// loop to prevent garbage values from being written to the accumulator.
		///
		/// Reduction vectorization is switched off by default; it can be enabled by
		/// passing a reduction recognizer to utility functions, or by passing
		/// `vectorize-reductions=true` to the vectorization pass.
		///
		/// Consider the following example:
		/// ```mlir
		/// func @vecred(%in: memref<512xf32>) -> f32 {
		/// %cst = constant 0.000000e+00 : f32
		/// %sum = affine.for %i = 0 to 500 iter_args(%part_sum = %cst) -> (f32) {
		/// %ld = affine.load %in[%i] : memref<512xf32>
		/// %cos = math.cos %ld : f32
		/// %add = addf %part_sum, %cos : f32
		/// affine.yield %add : f32
		/// }
		/// return %sum : f32
		/// }
		/// ```
		///
		/// The -affine-vectorize pass with the following arguments:
		/// ```
		/// -affine-vectorize="virtual-vector-size=128 test-fastest-varying=0 \
		/// vectorize-reductions=true"
		/// ```
		/// produces the following output:
		/// ```mlir
		/// #map = affine_map<(d0) -> (-d0 + 500)>
		/// func @vecred(%arg0: memref<512xf32>) -> f32 {
		/// %cst = constant 0.000000e+00 : f32
		/// %cst_0 = constant dense<0.000000e+00> : vector<128xf32>
		/// %0 = affine.for %arg1 = 0 to 500 step 128 iter_args(%arg2 = %cst_0)
		/// -> (vector<128xf32>) {
		/// // %2 is the number of iterations left in the original loop.
		/// %2 = affine.apply #map(%arg1)
		/// %3 = vector.create_mask %2 : vector<128xi1>
		/// %cst_1 = constant 0.000000e+00 : f32
		/// %4 = vector.transfer_read %arg0[%arg1], %cst_1 :
		/// memref<512xf32>, vector<128xf32>
		/// %5 = math.cos %4 : vector<128xf32>
		/// %6 = addf %arg2, %5 : vector<128xf32>
		/// // We filter out the effect of the last 12 elements using the mask.
		/// %7 = select %3, %6, %arg2 : vector<128xi1>, vector<128xf32>
		/// affine.yield %7 : vector<128xf32>
		/// }
		/// %1 = vector.reduction "add", %0 : vector<128xf32> into f32
		/// return %1 : f32
		/// }
		/// ```
		///
		/// Note that because of loop misalignment we needed to apply a mask to prevent
		/// the last 12 elements from affecting the final result. The mask is full of
		/// ones in every iteration except for the last one, in which it has the form
		/// `11...100...0` with 116 ones and 12 zeros.

#define DEBUG_TYPE "early-vect"		#define DEBUG_TYPE "early-vect"

using llvm::dbgs;		using llvm::dbgs;

/// Forward declaration.		/// Forward declaration.
static FilterFunctionType		static FilterFunctionType
isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,		isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,
▲ Show 20 Lines • Show All 141 Lines • ▼ Show 20 Lines	struct VectorizationState {
/// within the vector loop.		/// within the vector loop.
///		///
/// Example:		/// Example:
/// * 'replaced': induction variable of a loop to be vectorized.		/// * 'replaced': induction variable of a loop to be vectorized.
/// * 'replacement': new induction variable in the new vector loop.		/// * 'replacement': new induction variable in the new vector loop.
void registerValueScalarReplacement(BlockArgument replaced,		void registerValueScalarReplacement(BlockArgument replaced,
BlockArgument replacement);		BlockArgument replacement);

		/// Registers the scalar replacement of a scalar result returned from a
		/// reduction loop. 'replacement' must be scalar.
		///
		/// This utility is used to register the replacement for scalar results of
		/// vectorized reduction loops with iter_args.
		///
		/// Example:
		/// * 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32)
		/// * 'replacement': %1 = vector.reduction "add" %0 : vector<4xf32> into f32
		void registerLoopResultScalarReplacement(Value replaced, Value replacement);

/// Returns in 'replacedVals' the scalar replacement for values in		/// Returns in 'replacedVals' the scalar replacement for values in
/// 'inputVals'.		/// 'inputVals'.
void getScalarValueReplacementsFor(ValueRange inputVals,		void getScalarValueReplacementsFor(ValueRange inputVals,
SmallVectorImpl<Value> &replacedVals);		SmallVectorImpl<Value> &replacedVals);

/// Erases the scalar loop nest after its successful vectorization.		/// Erases the scalar loop nest after its successful vectorization.
void finishVectorizationPattern(AffineForOp rootLoop);		void finishVectorizationPattern(AffineForOp rootLoop);

// Used to build and insert all the new operations created. The insertion		// Used to build and insert all the new operations created. The insertion
// point is preserved and updated along the vectorization process.		// point is preserved and updated along the vectorization process.
OpBuilder builder;		OpBuilder builder;

// Maps input scalar operations to their vector counterparts.		// Maps input scalar operations to their vector counterparts.
DenseMap<Operation , Operation > opVectorReplacement;		DenseMap<Operation , Operation > opVectorReplacement;
// Maps input scalar values to their vector counterparts.		// Maps input scalar values to their vector counterparts.
BlockAndValueMapping valueVectorReplacement;		BlockAndValueMapping valueVectorReplacement;
// Maps input scalar values to their new scalar counterparts in the vector		// Maps input scalar values to their new scalar counterparts in the vector
// loop nest.		// loop nest.
BlockAndValueMapping valueScalarReplacement;		BlockAndValueMapping valueScalarReplacement;
		// Maps results of reduction loops to their new scalar counterparts.
		DenseMap<Value, Value> loopResultScalarReplacement;

// Maps the newly created vector loops to their vector dimension.		// Maps the newly created vector loops to their vector dimension.
DenseMap<Operation *, unsigned> vecLoopToVecDim;		DenseMap<Operation *, unsigned> vecLoopToVecDim;

		// Maps the new vectorized loops to the corresponding vector masks if it is
		// required.
		DenseMap<Operation *, Value> vecLoopToMask;

// The strategy drives which loop to vectorize by which amount.		// The strategy drives which loop to vectorize by which amount.
const VectorizationStrategy *strategy;		const VectorizationStrategy *strategy;

private:		private:
/// Internal implementation to map input scalar values to new vector or scalar		/// Internal implementation to map input scalar values to new vector or scalar
/// values.		/// values.
void registerValueVectorReplacementImpl(Value replaced, Value replacement);		void registerValueVectorReplacementImpl(Value replaced, Value replacement);
void registerValueScalarReplacementImpl(Value replaced, Value replacement);		void registerValueScalarReplacementImpl(Value replaced, Value replacement);
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines
/// Example:		/// Example:
/// * 'replaced': induction variable of a loop to be vectorized.		/// * 'replaced': induction variable of a loop to be vectorized.
/// * 'replacement': new induction variable in the new vector loop.		/// * 'replacement': new induction variable in the new vector loop.
void VectorizationState::registerValueScalarReplacement(		void VectorizationState::registerValueScalarReplacement(
BlockArgument replaced, BlockArgument replacement) {		BlockArgument replaced, BlockArgument replacement) {
registerValueScalarReplacementImpl(replaced, replacement);		registerValueScalarReplacementImpl(replaced, replacement);
}		}

		/// Registers the scalar replacement of a scalar result returned from a
		/// reduction loop. 'replacement' must be scalar.
		///
		/// This utility is used to register the replacement for scalar results of
		/// vectorized reduction loops with iter_args.
		///
		/// Example:
		/// * 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32)
		/// * 'replacement': %1 = vector.reduction "add" %0 : vector<4xf32> into f32
		///
		/// Preconditions (checked by assertions only):
		/// * 'replaced' is a result of an affine.for operation,
		/// * 'replacement' does not have a vector type,
		/// * no replacement has been registered for 'replaced' yet.
		void VectorizationState::registerLoopResultScalarReplacement(
		    Value replaced, Value replacement) {
		  // getDefiningOp<OpTy>() yields a null op for block arguments and for results
		  // of other operations, so this check is safe for any input value.
		  assert(replaced.getDefiningOp<AffineForOp>() &&
		         "Expected a result of an affine.for as the replaced value");
		  // Enforce the documented contract, mirroring the check done for other
		  // scalar replacements in registerValueScalarReplacementImpl.
		  assert(!replacement.getType().isa<VectorType>() &&
		         "Expected scalar type in scalar replacement");
		  assert(loopResultScalarReplacement.count(replaced) == 0 &&
		         "already registered");
		  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ will replace a result of the loop "
		                       "with scalar: "
		                    << replacement);
		  loopResultScalarReplacement[replaced] = replacement;
		}

void VectorizationState::registerValueScalarReplacementImpl(Value replaced,		void VectorizationState::registerValueScalarReplacementImpl(Value replaced,
Value replacement) {		Value replacement) {
assert(!valueScalarReplacement.contains(replaced) &&		assert(!valueScalarReplacement.contains(replaced) &&
"Scalar value replacement already registered");		"Scalar value replacement already registered");
assert(!replacement.getType().isa<VectorType>() &&		assert(!replacement.getType().isa<VectorType>() &&
"Expected scalar type in scalar replacement");		"Expected scalar type in scalar replacement");
valueScalarReplacement.map(replaced, replacement);		valueScalarReplacement.map(replaced, replacement);
}		}
▲ Show 20 Lines • Show All 73 Lines • ▼ Show 20 Lines	static ConstantOp vectorizeConstant(ConstantOp constOp,
auto vecAttr = DenseElementsAttr::get(vecTy, constOp.getValue());		auto vecAttr = DenseElementsAttr::get(vecTy, constOp.getValue());
auto newConstOp = state.builder.create<ConstantOp>(constOp.getLoc(), vecAttr);		auto newConstOp = state.builder.create<ConstantOp>(constOp.getLoc(), vecAttr);

// Register vector replacement for future uses in the scope.		// Register vector replacement for future uses in the scope.
state.registerOpVectorReplacement(constOp, newConstOp);		state.registerOpVectorReplacement(constOp, newConstOp);
return newConstOp;		return newConstOp;
}		}

		/// Builds a constant vector in which every element is the neutral element of
		/// `reduction`. The element type is the type of `oldOperand`, and the new
		/// constant is created at `oldOperand`'s location through `state.builder`.
		/// Returns a null op when that type is not a valid vector element type.
		static ConstantOp createInitialVector(const ReductionInfo *reduction,
		                                      Value oldOperand,
		                                      VectorizationState &state) {
		  Type elementTy = oldOperand.getType();
		  if (VectorType::isValidElementType(elementTy)) {
		    // Neutral element of the reduction, expressed as an attribute of the
		    // scalar element type.
		    Attribute neutralAttr =
		        reduction->getNeutralElementAttr(elementTy, state.builder);
		    // Vector type chosen by the vectorization strategy for this element type.
		    auto vectorTy = getVectorType(elementTy, state.strategy);
		    auto filledAttr = DenseElementsAttr::get(vectorTy, neutralAttr);
		    return state.builder.create<ConstantOp>(oldOperand.getLoc(), filledAttr);
		  }
		  return nullptr;
		}

		/// Creates a mask used to filter out garbage elements in the last iteration
		/// of unaligned loops. If a mask is not required then `nullptr` is returned.
		/// The mask will be a vector of booleans representing meaningful vector
		/// elements in the current iteration. It is filled with ones for each iteration
		/// except for the last one, where it has the form `11...100...0` with the
		/// number of ones equal to the number of meaningful elements (i.e. the number
		/// of iterations that would be left in the original loop).
		static Value createMask(AffineForOp vecForOp, VectorizationState &state) {
		assert(state.strategy->vectorSizes.size() == 1 &&
		"Creating a mask non-1-D vectors is not supported.");
		assert(vecForOp.getStep() == state.strategy->vectorSizes[0] &&
		"Creating a mask for loops with non-unit original step size is not "
		"supported.");

		// Check if we have already created the mask.
		if (Value mask = state.vecLoopToMask.lookup(vecForOp))
		return mask;

		// If the loop has constant bounds and the original number of iterations is
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions Note: there is also the case where all the `vector.transfer_read` in the backward slice of the reduction (intersected with the filter "nested under loop of interest") have `inBounds == true`. This is the way we can inject static information into dynamic memrefs atm. nicolasvasilache: Note: there is also the case where all the `vector.transfer_read` in the backward slice of the…
		sgrechanikAuthorUnsubmitted Done Reply Inline Actions Not sure I understand your comment correctly, but there may be cases when the read is within bounds, but we still need to filter out some elements for the reduction (e.g. when the dimension size of a memref is larger than the upper bound of the corresponding loop). sgrechanik: Not sure I understand your comment correctly, but there may be cases when the read is within…
		// divisible by the vector size then we don't need a mask.
		if (vecForOp.hasConstantBounds()) {
		int64_t originalTripCount =
		vecForOp.getConstantUpperBound() - vecForOp.getConstantLowerBound();
		if (originalTripCount % vecForOp.getStep() == 0)
		return nullptr;
		}

		OpBuilder::InsertionGuard guard(state.builder);
		state.builder.setInsertionPointToStart(vecForOp.getBody());

		// We generate the mask using the `vector.create_mask` operation which accepts
		// the number of meaningful elements (i.e. the length of the prefix of 1s).
		// To compute the number of meaningful elements we subtract the current value
		// of the iteration variable from the upper bound of the loop. Example:
		//
		// // 500 is the upper bound of the loop
		// #map = affine_map<(d0) -> (500 - d0)>
		// %elems_left = affine.apply #map(%iv)
		// %mask = vector.create_mask %elems_left : vector<128xi1>

		Location loc = vecForOp.getLoc();

		// First we get the upper bound of the loop using `affine.apply` or
		// `affine.min`.
		AffineMap ubMap = vecForOp.getUpperBoundMap();
		Value ub;
		if (ubMap.getNumResults() == 1)
		ub = state.builder.create<AffineApplyOp>(loc, vecForOp.getUpperBoundMap(),
		vecForOp.getUpperBoundOperands());
		else
		ub = state.builder.create<AffineMinOp>(loc, vecForOp.getUpperBoundMap(),
		vecForOp.getUpperBoundOperands());
		// Then we compute the number of (original) iterations left in the loop.
		AffineExpr subExpr =
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions for composability you prob. want this to be an AffineApplyOp itself. nicolasvasilache: for composability you prob. want this to be an AffineApplyOp itself.
		sgrechanikAuthorUnsubmitted Done Reply Inline Actions Yes, makes sense. sgrechanik: Yes, makes sense.
		state.builder.getAffineDimExpr(0) - state.builder.getAffineDimExpr(1);
		Value itersLeft =
		makeComposedAffineApply(state.builder, loc, AffineMap::get(2, 0, subExpr),
		{ub, vecForOp.getInductionVar()});
		// If the affine maps were successfully composed then `ub` is unneeded.
		if (ub.use_empty())
		ub.getDefiningOp()->erase();
		// Finally we create the mask.
		Type maskTy = VectorType::get(state.strategy->vectorSizes,
		state.builder.getIntegerType(1));
		Value mask =
		state.builder.create<vector::CreateMaskOp>(loc, maskTy, itersLeft);

		LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a mask:\n"
		<< itersLeft << "\n"
		<< mask << "\n");

		state.vecLoopToMask[vecForOp] = mask;
		return mask;
		}

/// Returns true if the provided value is vector uniform given the vectorization		/// Returns true if the provided value is vector uniform given the vectorization
/// strategy.		/// strategy.
// TODO: For now, only values that are invariants to all the loops in the		// TODO: For now, only values that are invariants to all the loops in the
// vectorization strategy are considered vector uniforms.		// vectorization strategy are considered vector uniforms.
static bool isUniformDefinition(Value value,		static bool isUniformDefinition(Value value,
const VectorizationStrategy *strategy) {		const VectorizationStrategy *strategy) {
for (auto loopToDim : strategy->loopToVectorDim) {		for (auto loopToDim : strategy->loopToVectorDim) {
auto loop = cast<AffineForOp>(loopToDim.first);		auto loop = cast<AffineForOp>(loopToDim.first);
▲ Show 20 Lines • Show All 157 Lines • ▼ Show 20 Lines	auto transfer = state.builder.create<vector::TransferWriteOp>(
permutationMap);		permutationMap);
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << transfer);		LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << transfer);

// Register replacement for future uses in the scope.		// Register replacement for future uses in the scope.
state.registerOpVectorReplacement(storeOp, transfer);		state.registerOpVectorReplacement(storeOp, transfer);
return transfer;		return transfer;
}		}

		/// Returns true if `value` is a constant equal to the neutral element of the
		/// given vectorizable reduction.
		static bool isNeutralElementConst(const ReductionInfo *reduction, Value value,
		VectorizationState &state) {
		Type scalarTy = value.getType();
		if (!VectorType::isValidElementType(scalarTy))
		return false;
		Attribute valueAttr =
		reduction->getNeutralElementAttr(scalarTy, state.builder);
		if (auto constOp = dyn_cast_or_null<ConstantOp>(value.getDefiningOp()))
		return constOp.value() == valueAttr;
		return false;
		}

/// Vectorizes a loop with the vectorization strategy in 'state'. A new loop is		/// Vectorizes a loop with the vectorization strategy in 'state'. A new loop is
/// created and registered as replacement for the scalar loop. The builder's		/// created and registered as replacement for the scalar loop. The builder's
/// insertion point is set to the new loop's body so that subsequent vectorized		/// insertion point is set to the new loop's body so that subsequent vectorized
/// operations are inserted into the new loop. If the loop is a vector		/// operations are inserted into the new loop. If the loop is a vector
/// dimension, the step of the newly created loop will reflect the vectorization		/// dimension, the step of the newly created loop will reflect the vectorization
/// factor used to vectorize that dimension.		/// factor used to vectorize that dimension.
// TODO: Add support for 'iter_args'. Related operands and results will be
// vectorized at this point.
static Operation *vectorizeAffineForOp(AffineForOp forOp,		static Operation *vectorizeAffineForOp(AffineForOp forOp,
VectorizationState &state) {		VectorizationState &state) {
const VectorizationStrategy &strategy = *state.strategy;		const VectorizationStrategy &strategy = *state.strategy;
auto loopToVecDimIt = strategy.loopToVectorDim.find(forOp);		auto loopToVecDimIt = strategy.loopToVectorDim.find(forOp);
bool isLoopVecDim = loopToVecDimIt != strategy.loopToVectorDim.end();		bool isLoopVecDim = loopToVecDimIt != strategy.loopToVectorDim.end();

// We only support 'iter_args' when the loop is not one of the vector		// TODO: Vectorization of reduction loops is not supported for non-unit steps.
// dimensions.		if (isLoopVecDim && forOp.getNumIterOperands() > 0 && forOp.getStep() != 1) {
// TODO: Support vector dimension loops. They require special handling:		LLVM_DEBUG(
// generate horizontal reduction, last-value extraction, etc.		dbgs()
if (forOp.getNumIterOperands() > 0 && isLoopVecDim)		<< "\n[early-vect]+++++ unsupported step size for reduction loop: "
		<< forOp.getStep() << "\n");
return nullptr;		return nullptr;
		}

// If we are vectorizing a vector dimension, compute a new step for the new		// If we are vectorizing a vector dimension, compute a new step for the new
// vectorized loop using the vectorization factor for the vector dimension.		// vectorized loop using the vectorization factor for the vector dimension.
// Otherwise, propagate the step of the scalar loop.		// Otherwise, propagate the step of the scalar loop.
unsigned newStep;		unsigned newStep;
if (isLoopVecDim) {		if (isLoopVecDim) {
unsigned vectorDim = loopToVecDimIt->second;		unsigned vectorDim = loopToVecDimIt->second;
assert(vectorDim < strategy.vectorSizes.size() && "vector dim overflow");		assert(vectorDim < strategy.vectorSizes.size() && "vector dim overflow");
int64_t forOpVecFactor = strategy.vectorSizes[vectorDim];		int64_t forOpVecFactor = strategy.vectorSizes[vectorDim];
newStep = forOp.getStep() * forOpVecFactor;		newStep = forOp.getStep() * forOpVecFactor;
} else {		} else {
newStep = forOp.getStep();		newStep = forOp.getStep();
}		}

		// Get information about recognized reduction kinds.
		SmallVector<const ReductionInfo *, 4> reductions;
		if (isLoopVecDim && forOp.getNumIterOperands() > 0) {
		if (!strategy.reductionRecognizer) {
		LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ cannot vectorize loop with "
		"iter_args: no recognizer provided\n");
		return nullptr;
		}
		bool allKnown =
		getKnownReductions(forOp, *strategy.reductionRecognizer, reductions);
		if (!allKnown) {
		LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ cannot vectorize loop with "
		"iter_args: some reductions are not recognized\n");
		return nullptr;
		}
		}

// Vectorize 'iter_args'.		// Vectorize 'iter_args'.
SmallVector<Value, 8> vecIterOperands;		SmallVector<Value, 8> vecIterOperands;
		if (!isLoopVecDim) {
for (auto operand : forOp.getIterOperands())		for (auto operand : forOp.getIterOperands())
vecIterOperands.push_back(vectorizeOperand(operand, state));		vecIterOperands.push_back(vectorizeOperand(operand, state));
		} else {
		// For reduction loops we need to pass a vector of neutral elements as an
		// initial value of the accumulator. We will add the original initial value
		// later.
		for (auto redAndOperand : llvm::zip(reductions, forOp.getIterOperands())) {
		vecIterOperands.push_back(createInitialVector(
		std::get<0>(redAndOperand), std::get<1>(redAndOperand), state));
		}
		}

auto vecForOp = state.builder.create<AffineForOp>(		auto vecForOp = state.builder.create<AffineForOp>(
forOp.getLoc(), forOp.getLowerBoundOperands(), forOp.getLowerBoundMap(),		forOp.getLoc(), forOp.getLowerBoundOperands(), forOp.getLowerBoundMap(),
forOp.getUpperBoundOperands(), forOp.getUpperBoundMap(), newStep,		forOp.getUpperBoundOperands(), forOp.getUpperBoundMap(), newStep,
vecIterOperands,		vecIterOperands,
/bodyBuilder=/[](OpBuilder &, Location, Value, ValueRange) {		/bodyBuilder=/[](OpBuilder &, Location, Value, ValueRange) {
// Make sure we don't create a default terminator in the loop body as		// Make sure we don't create a default terminator in the loop body as
// the proper terminator will be added during vectorization.		// the proper terminator will be added during vectorization.
return;		return;
});		});

// Register loop-related replacements:		// Register loop-related replacements:
// 1) The new vectorized loop is registered as vector replacement of the		// 1) The new vectorized loop is registered as vector replacement of the
// scalar loop.		// scalar loop.
// TODO: Support reductions along the vector dimension.
// 2) The new iv of the vectorized loop is registered as scalar replacement		// 2) The new iv of the vectorized loop is registered as scalar replacement
// since a scalar copy of the iv will prevail in the vectorized loop.		// since a scalar copy of the iv will prevail in the vectorized loop.
// TODO: A vector replacement will also be added in the future when		// TODO: A vector replacement will also be added in the future when
// vectorization of linear ops is supported.		// vectorization of linear ops is supported.
// 3) The new 'iter_args' region arguments are registered as vector		// 3) The new 'iter_args' region arguments are registered as vector
// replacements since they have been vectorized.		// replacements since they have been vectorized.
		// 4) If the loop performs a reduction along the vector dimension, a
		// `vector.reduction` or similar op is inserted for each resulting value
		// of the loop and its scalar value replaces the corresponding scalar
		// result of the loop.
state.registerOpVectorReplacement(forOp, vecForOp);		state.registerOpVectorReplacement(forOp, vecForOp);
state.registerValueScalarReplacement(forOp.getInductionVar(),		state.registerValueScalarReplacement(forOp.getInductionVar(),
vecForOp.getInductionVar());		vecForOp.getInductionVar());
for (auto iterTuple :		for (auto iterTuple :
llvm ::zip(forOp.getRegionIterArgs(), vecForOp.getRegionIterArgs()))		llvm ::zip(forOp.getRegionIterArgs(), vecForOp.getRegionIterArgs()))
state.registerBlockArgVectorReplacement(std::get<0>(iterTuple),		state.registerBlockArgVectorReplacement(std::get<0>(iterTuple),
std::get<1>(iterTuple));		std::get<1>(iterTuple));

		if (isLoopVecDim) {
		for (unsigned i = 0; i < vecForOp.getNumIterOperands(); ++i) {
		// First, we reduce the vector returned from the loop into a scalar.
		Value reducedRes = reductions[i]->createVectorReduction(
		vecForOp.getResult(i), state.builder);
		LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a vector reduction: "
		<< reducedRes);
		// Then we combine it with the original (scalar) initial value unless it
		// is equal to the neutral element of the reduction.
		Value origInit = forOp.getOperand(forOp.getNumControlOperands() + i);
		Value finalRes = reducedRes;
		if (!isNeutralElementConst(reductions[i], origInit, state))
		finalRes = reductions[i]->combine(reducedRes, origInit, state.builder);
		state.registerLoopResultScalarReplacement(forOp.getResult(i), finalRes);
		}
		}

if (isLoopVecDim)		if (isLoopVecDim)
state.vecLoopToVecDim[vecForOp] = loopToVecDimIt->second;		state.vecLoopToVecDim[vecForOp] = loopToVecDimIt->second;

// Change insertion point so that upcoming vectorized instructions are		// Change insertion point so that upcoming vectorized instructions are
// inserted into the vectorized loop's body.		// inserted into the vectorized loop's body.
state.builder.setInsertionPointToStart(vecForOp.getBody());		state.builder.setInsertionPointToStart(vecForOp.getBody());

		// If this is a reduction loop then we may need to create a mask to filter out
		// garbage in the last iteration.
		if (isLoopVecDim && forOp.getNumIterOperands() > 0)
		createMask(vecForOp, state);

return vecForOp;		return vecForOp;
}		}

/// Vectorizes arbitrary operation by plain widening. We apply generic type		/// Vectorizes arbitrary operation by plain widening. We apply generic type
/// widening of all its results and retrieve the vector counterparts for all its		/// widening of all its results and retrieve the vector counterparts for all its
/// operands.		/// operands.
static Operation widenOp(Operation op, VectorizationState &state) {		static Operation widenOp(Operation op, VectorizationState &state) {
SmallVector<Type, 8> vectorTypes;		SmallVector<Type, 8> vectorTypes;
Show All 21 Lines	OperationState vecOpState(op->getLoc(), op->getName().getStringRef(),
/successors=/{}, /regions=/{});		/successors=/{}, /regions=/{});
Operation *vecOp = state.builder.createOperation(vecOpState);		Operation *vecOp = state.builder.createOperation(vecOpState);
state.registerOpVectorReplacement(op, vecOp);		state.registerOpVectorReplacement(op, vecOp);
return vecOp;		return vecOp;
}		}

/// Vectorizes a yield operation by widening its types. The builder's insertion		/// Vectorizes a yield operation by widening its types. The builder's insertion
/// point is set after the vectorized parent op to continue vectorizing the		/// point is set after the vectorized parent op to continue vectorizing the
/// operations after the parent op.		/// operations after the parent op. When vectorizing a reduction loop a mask may
		/// be used to prevent adding garbage values to the accumulator.
static Operation *vectorizeAffineYieldOp(AffineYieldOp yieldOp,		static Operation *vectorizeAffineYieldOp(AffineYieldOp yieldOp,
VectorizationState &state) {		VectorizationState &state) {
Operation *newYieldOp = widenOp(yieldOp, state);		Operation *newYieldOp = widenOp(yieldOp, state);
Operation *newParentOp = state.builder.getInsertionBlock()->getParentOp();		Operation *newParentOp = state.builder.getInsertionBlock()->getParentOp();

		// If there is a mask for this loop then we must prevent garbage values from
		// being added to the accumulator by inserting `select` operations, for
		// example:
		//
		// %res = addf %acc, %val : vector<128xf32>
		// %res_masked = select %mask, %res, %acc : vector<128xi1>, vector<128xf32>
		// affine.yield %res_masked : vector<128xf32>
		//
		if (Value mask = state.vecLoopToMask.lookup(newParentOp)) {
		state.builder.setInsertionPoint(newYieldOp);
		for (unsigned i = 0; i < newYieldOp->getNumOperands(); ++i) {
		Value result = newYieldOp->getOperand(i);
		Value iterArg = cast<AffineForOp>(newParentOp).getRegionIterArgs()[i];
		Value maskedResult = state.builder.create<SelectOp>(result.getLoc(), mask,
		result, iterArg);
		LLVM_DEBUG(
		dbgs() << "\n[early-vect]+++++ masking a yielded vector value: "
		<< maskedResult);
		newYieldOp->setOperand(i, maskedResult);
		}
		}

state.builder.setInsertionPointAfter(newParentOp);		state.builder.setInsertionPointAfter(newParentOp);
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions Note: some of this is quite ancient and predates `OpBuilder::InsertionGuard`. We should cleanup the load-bearing "insertion point"-passing across function boundaries via `state.builder` at some point. nicolasvasilache: Note: some of this is quite ancient and predates `OpBuilder::InsertionGuard`. We should cleanup…
return newYieldOp;		return newYieldOp;
}		}

/// Encodes Operation-specific behavior for vectorization. In general we		/// Encodes Operation-specific behavior for vectorization. In general we
/// assume that all operands of an op must be vectorized but this is not		/// assume that all operands of an op must be vectorized but this is not
/// always true. In the future, it would be nice to have a trait that		/// always true. In the future, it would be nice to have a trait that
/// describes how a particular operation vectorizes. For now we implement the		/// describes how a particular operation vectorizes. For now we implement the
/// case distinction here. Returns a vectorized form of an operation or		/// case distinction here. Returns a vectorized form of an operation or
▲ Show 20 Lines • Show All 86 Lines • ▼ Show 20 Lines	vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
// vectorization succeeds, the scalar loop nest is erased. If vectorization		// vectorization succeeds, the scalar loop nest is erased. If vectorization
// fails, the vector loop nest is erased and the scalar loop nest is not		// fails, the vector loop nest is erased and the scalar loop nest is not
// modified.		// modified.
//////////////////////////////////////////////////////////////////////////////		//////////////////////////////////////////////////////////////////////////////

auto opVecResult = rootLoop.walk<WalkOrder::PreOrder>([&](Operation *op) {		auto opVecResult = rootLoop.walk<WalkOrder::PreOrder>([&](Operation *op) {
LLVM_DEBUG(dbgs() << "[early-vect]+++++ Vectorizing: " << *op);		LLVM_DEBUG(dbgs() << "[early-vect]+++++ Vectorizing: " << *op);
Operation *vectorOp = vectorizeOneOperation(op, state);		Operation *vectorOp = vectorizeOneOperation(op, state);
if (!vectorOp)		if (!vectorOp) {
		LLVM_DEBUG(
		dbgs() << "[early-vect]+++++ failed vectorizing the operation: "
		<< *op << "\n");
return WalkResult::interrupt();		return WalkResult::interrupt();
		}

return WalkResult::advance();		return WalkResult::advance();
});		});

if (opVecResult.wasInterrupted()) {		if (opVecResult.wasInterrupted()) {
LLVM_DEBUG(dbgs() << "[early-vect]+++++ failed vectorization for: "		LLVM_DEBUG(dbgs() << "[early-vect]+++++ failed vectorization for: "
<< rootLoop << "\n");		<< rootLoop << "\n");
// Erase vector loop nest if it was created.		// Erase vector loop nest if it was created.
auto vecRootLoopIt = state.opVectorReplacement.find(rootLoop);		auto vecRootLoopIt = state.opVectorReplacement.find(rootLoop);
if (vecRootLoopIt != state.opVectorReplacement.end())		if (vecRootLoopIt != state.opVectorReplacement.end())
eraseLoopNest(cast<AffineForOp>(vecRootLoopIt->second));		eraseLoopNest(cast<AffineForOp>(vecRootLoopIt->second));

return failure();		return failure();
}		}

		// Replace results of reduction loops with the scalar values computed using
		// `vector.reduction` or similar ops.
		for (auto resPair : state.loopResultScalarReplacement)
		resPair.first.replaceAllUsesWith(resPair.second);

assert(state.opVectorReplacement.count(rootLoop) == 1 &&		assert(state.opVectorReplacement.count(rootLoop) == 1 &&
"Expected vector replacement for loop nest");		"Expected vector replacement for loop nest");
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");		LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorization result:\n"		LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorization result:\n"
<< *state.opVectorReplacement[rootLoop]);		<< *state.opVectorReplacement[rootLoop]);

// Finish this vectorization pattern.		// Finish this vectorization pattern.
state.finishVectorizationPattern(rootLoop);		state.finishVectorizationPattern(rootLoop);
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines

/// Internal implementation to vectorize affine loops in 'loops' using the n-D		/// Internal implementation to vectorize affine loops in 'loops' using the n-D
/// vectorization factors in 'vectorSizes'. By default, each vectorization		/// vectorization factors in 'vectorSizes'. By default, each vectorization
/// factor is applied inner-to-outer to the loops of each loop nest.		/// factor is applied inner-to-outer to the loops of each loop nest.
/// 'fastestVaryingPattern' can be optionally used to provide a different loop		/// 'fastestVaryingPattern' can be optionally used to provide a different loop
/// vectorization order.		/// vectorization order.
static void vectorizeLoops(Operation parentOp, DenseSet<Operation > &loops,		static void vectorizeLoops(Operation parentOp, DenseSet<Operation > &loops,
ArrayRef<int64_t> vectorSizes,		ArrayRef<int64_t> vectorSizes,
ArrayRef<int64_t> fastestVaryingPattern) {		ArrayRef<int64_t> fastestVaryingPattern,
		const ReductionRecognizer *reductionRecognizer) {
		assert((!reductionRecognizer \|\| vectorSizes.size() == 1) &&
		"Vectorizing reductions is supported only for 1-D vectors");

// Compute 1-D, 2-D or 3-D loop pattern to be matched on the target loops.		// Compute 1-D, 2-D or 3-D loop pattern to be matched on the target loops.
Optional<NestedPattern> pattern =		Optional<NestedPattern> pattern =
makePattern(loops, vectorSizes.size(), fastestVaryingPattern);		makePattern(loops, vectorSizes.size(), fastestVaryingPattern);
if (!pattern.hasValue()) {		if (!pattern.hasValue()) {
LLVM_DEBUG(dbgs() << "\n[early-vect] pattern couldn't be computed\n");		LLVM_DEBUG(dbgs() << "\n[early-vect] pattern couldn't be computed\n");
return;		return;
}		}

Show All 14 Lines	static void vectorizeLoops(Operation parentOp, DenseSet<Operation > &loops,
// Iterate over all buckets and vectorize the matches eagerly. We can only		// Iterate over all buckets and vectorize the matches eagerly. We can only
// vectorize one match from each bucket since all the matches within a bucket		// vectorize one match from each bucket since all the matches within a bucket
// intersect.		// intersect.
for (auto &intersectingMatches : intersectionBuckets) {		for (auto &intersectingMatches : intersectionBuckets) {
for (NestedMatch &match : intersectingMatches) {		for (NestedMatch &match : intersectingMatches) {
VectorizationStrategy strategy;		VectorizationStrategy strategy;
// TODO: depending on profitability, elect to reduce the vector size.		// TODO: depending on profitability, elect to reduce the vector size.
strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());		strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());
		strategy.reductionRecognizer = reductionRecognizer;
if (failed(analyzeProfitability(match.getMatchedChildren(), 1,		if (failed(analyzeProfitability(match.getMatchedChildren(), 1,
patternDepth, &strategy))) {		patternDepth, &strategy))) {
continue;		continue;
}		}
vectorizeLoopIfProfitable(match.getMatchedOperation(), 0, patternDepth,		vectorizeLoopIfProfitable(match.getMatchedOperation(), 0, patternDepth,
&strategy);		&strategy);
// Vectorize match. Skip the rest of intersecting matches in the bucket if		// Vectorize match. Skip the rest of intersecting matches in the bucket if
// vectorization succeeded.		// vectorization succeeded.
Show All 21 Lines	void Vectorize::runOnFunction() {
FuncOp f = getFunction();		FuncOp f = getFunction();
if (!fastestVaryingPattern.empty() &&		if (!fastestVaryingPattern.empty() &&
fastestVaryingPattern.size() != vectorSizes.size()) {		fastestVaryingPattern.size() != vectorSizes.size()) {
f.emitRemark("Fastest varying pattern specified with different size than "		f.emitRemark("Fastest varying pattern specified with different size than "
"the vector size.");		"the vector size.");
return signalPassFailure();		return signalPassFailure();
}		}

		if (vectorizeReductions && vectorSizes.size() != 1) {
		f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
		return signalPassFailure();
		}

		// If 'vectorize-reduction=true' is provided, use the standard reduction
		// recognizer, otherwise use the null reduction recognizer which rejects all
		// reductions.
		const ReductionRecognizer &stdRedRecognizer = StandardReductionRecognizer();
		const ReductionRecognizer &nullRedRecognizer = NullReductionRecognizer();
		const ReductionRecognizer &reductionRecognizer =
		vectorizeReductions ? stdRedRecognizer : nullRedRecognizer;

DenseSet<Operation *> parallelLoops;		DenseSet<Operation *> parallelLoops;
f.walk([&parallelLoops](AffineForOp loop) {		f.walk([&parallelLoops, &reductionRecognizer](AffineForOp loop) {
if (isLoopParallel(loop))		if (isParallelReductionLoop(loop, reductionRecognizer))
parallelLoops.insert(loop);		parallelLoops.insert(loop);
});		});

// Thread-safe RAII local context, BumpPtrAllocator freed on exit.		// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;		NestedPatternContext mlContext;
vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);		vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern,
		vectorizeReductions ? &reductionRecognizer : nullptr);
}		}

/// Verify that affine loops in 'loops' meet the nesting criteria expected by		/// Verify that affine loops in 'loops' meet the nesting criteria expected by
/// SuperVectorizer:		/// SuperVectorizer:
/// * There must be at least one loop.		/// * There must be at least one loop.
/// * There must be a single root loop (nesting level 0).		/// * There must be a single root loop (nesting level 0).
/// * Each loop at a given nesting level must be nested in a loop from a		/// * Each loop at a given nesting level must be nested in a loop from a
/// previous nesting level.		/// previous nesting level.
Show All 27 Lines	verifyLoopNesting(const std::vector<SmallVector<AffineForOp, 2>> &loops) {
}		}

return success();		return success();
}		}

namespace mlir {		namespace mlir {

/// External utility to vectorize affine loops in 'loops' using the n-D		/// External utility to vectorize affine loops in 'loops' using the n-D
/// vectorization factors in 'vectorSizes'. By default, each vectorization		/// vectorization factors in 'vectorSizes'. By default, each vectorization
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions Nice! This is completely unrelated to affine though and could help other places (e.g. the Linalg vectorizer). Can you please move this to a dialect-independent/std-dialect utils ? nicolasvasilache: Nice! This is completely unrelated to affine though and could help other places (e.g. the…
		sgrechanikAuthorUnsubmitted Done Reply Inline Actions Yeah, I'll try to move it into some more publicly accessible place. sgrechanik: Yeah, I'll try to move it into some more publicly accessible place.
/// factor is applied inner-to-outer to the loops of each loop nest.		/// factor is applied inner-to-outer to the loops of each loop nest.
/// 'fastestVaryingPattern' can be optionally used to provide a different loop		/// 'fastestVaryingPattern' can be optionally used to provide a different loop
/// vectorization order.		/// vectorization order.
		/// If `reductionRecognizer` is not null, recognized reduction loops may be
		/// vectorized along the reduction dimension.
		/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
void vectorizeAffineLoops(Operation parentOp, DenseSet<Operation > &loops,		void vectorizeAffineLoops(Operation parentOp, DenseSet<Operation > &loops,
ArrayRef<int64_t> vectorSizes,		ArrayRef<int64_t> vectorSizes,
ArrayRef<int64_t> fastestVaryingPattern) {		ArrayRef<int64_t> fastestVaryingPattern,
		const ReductionRecognizer *reductionRecognizer) {
// Thread-safe RAII local context, BumpPtrAllocator freed on exit.		// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;		NestedPatternContext mlContext;
vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern);		vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern,
		reductionRecognizer);
}		}

/// External utility to vectorize affine loops from a single loop nest using an		/// External utility to vectorize affine loops from a single loop nest using an
/// n-D vectorization strategy (see doc in VectorizationStrategy definition).		/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
/// Loops are provided in a 2D vector container. The first dimension represents		/// Loops are provided in a 2D vector container. The first dimension represents
/// the nesting level relative to the loops to be vectorized. The second		/// the nesting level relative to the loops to be vectorized. The second
/// dimension contains the loops. This means that:		/// dimension contains the loops. This means that:
/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',		/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
▲ Show 20 Lines • Show All 48 Lines • Show Last 20 Lines

mlir/lib/Transforms/Utils/CMakeLists.txt

	add_mlir_library(MLIRTransformUtils			add_mlir_library(MLIRTransformUtils
	DialectConversion.cpp			DialectConversion.cpp
	FoldUtils.cpp			FoldUtils.cpp
	GreedyPatternRewriteDriver.cpp			GreedyPatternRewriteDriver.cpp
	InliningUtils.cpp			InliningUtils.cpp
	LoopFusionUtils.cpp			LoopFusionUtils.cpp
	LoopUtils.cpp			LoopUtils.cpp
				ReductionUtils.cpp
	RegionUtils.cpp			RegionUtils.cpp
	Utils.cpp			Utils.cpp

	ADDITIONAL_HEADER_DIRS			ADDITIONAL_HEADER_DIRS
	${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms			${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms

	DEPENDS			DEPENDS
	MLIRStandardOpsIncGen			MLIRStandardOpsIncGen

	LINK_LIBS PUBLIC			LINK_LIBS PUBLIC
	MLIRAffine			MLIRAffine
	MLIRAnalysis			MLIRAnalysis
	MLIRLoopAnalysis			MLIRLoopAnalysis
	MLIRMemRef			MLIRMemRef
	MLIRSCF			MLIRSCF
	MLIRPass			MLIRPass
	MLIRRewrite			MLIRRewrite
	MLIRStandard			MLIRStandard
				MLIRVector
	)			)

mlir/lib/Transforms/Utils/ReductionUtils.cpp

This file was added.

				//===- ReductionUtils.cpp - Reduction-related utilities ---------- C++ --===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// This file implements utilities for recognizing and manipulating reductions.
				//
				//===----------------------------------------------------------------------===//

				#include "mlir/Transforms/ReductionUtils.h"
				#include "mlir/Analysis/Utils.h"
				#include "mlir/Dialect/StandardOps/IR/Ops.h"
				#include "mlir/Dialect/Vector/VectorOps.h"
				#include "mlir/IR/BuiltinTypes.h"
				#include "llvm/Support/Debug.h"

				#define DEBUG_TYPE "reduction-utils"

				using namespace mlir;
				using llvm::dbgs;

				/// Emits a `vector.reduction` operation that folds all elements of `vector`
				/// into a single scalar of the vector's element type. The reduction kind is
				/// taken from `kindString`, and the new operation reuses the location of
				/// `vector`.
				template <class Op>
				Value StandardReductionInfo<Op>::createVectorReduction(
				    Value vector, OpBuilder &builder) const {
				  auto vectorType = vector.getType().cast<ShapedType>();
				  Type elementType = vectorType.getElementType();
				  auto kindAttr = builder.getStringAttr(kindString);
				  // No accumulator operands are supplied to the reduction.
				  return builder.create<vector::ReductionOp>(vector.getLoc(), elementType,
				                                             kindAttr, vector, ValueRange{});
				}

				/// Creates an `Op` that combines `lhs` and `rhs` and returns its result. The
				/// new operation is placed at the location of `lhs`.
				template <class Op>
				Value StandardReductionInfo<Op>::combine(Value lhs, Value rhs,
				                                         OpBuilder &builder) const {
				  Location loc = lhs.getLoc();
				  return builder.create<Op>(loc, lhs, rhs);
				}

				/// Returns a pointer to the function-local static instance of this class for
				/// the given `Op`; every call yields the same object.
				template <class Op>
				ReductionInfo *StandardReductionInfo<Op>::get() {
				  static StandardReductionInfo<Op> instance;
				  return &instance;
				}

				/// The string used as the kind for `vector.reduction`.
				/// One explicit specialization is provided per supported combining op; the
				/// integer and floating-point variants of the same arithmetic operation share
				/// the same kind string.
				// Addition (integer and floating point).
				template <>
				const char *const StandardReductionInfo<AddIOp>::kindString = "add";
				template <>
				const char *const StandardReductionInfo<AddFOp>::kindString = "add";
				// Multiplication (integer and floating point).
				template <>
				const char *const StandardReductionInfo<MulIOp>::kindString = "mul";
				template <>
				const char *const StandardReductionInfo<MulFOp>::kindString = "mul";

				/// Neutral element for integer addition: the zero attribute of `elemType`.
				template <>
				Attribute
				StandardReductionInfo<AddIOp>::getNeutralElementAttr(Type elemType,
				OpBuilder &builder) const {
				return builder.getZeroAttr(elemType);
				}

				/// Neutral element for floating-point addition: the zero attribute of
				/// `elemType`.
				template <>
				Attribute
				StandardReductionInfo<AddFOp>::getNeutralElementAttr(Type elemType,
				OpBuilder &builder) const {
				return builder.getZeroAttr(elemType);
				}

				/// Neutral element for integer multiplication: the integer attribute 1 of
				/// type `elemType`.
				template <>
				Attribute
				StandardReductionInfo<MulIOp>::getNeutralElementAttr(Type elemType,
				OpBuilder &builder) const {
				return builder.getIntegerAttr(elemType, 1);
				}

				/// Neutral element for floating-point multiplication: the float attribute 1
				/// of type `elemType`.
				template <>
				Attribute
				StandardReductionInfo<MulFOp>::getNeutralElementAttr(Type elemType,
				OpBuilder &builder) const {
				return builder.getFloatAttr(elemType, 1);
				}

				/// Tries to match a standard reduction from the loop-carried scalar `arg`
				/// (the value coming from the previous iteration) and `yielded` (the value
				/// handed to the next iteration). Returns the matching `ReductionInfo`, or
				/// null if the pattern is not a recognized reduction.
				/// Example:
				///
				///   %sum = affine.for %j = 0 to 512 iter_args(%arg = %cst0) -> (f32) {
				///     %ld = affine.load %in[%j] : memref<512xf32>
				///     %yielded = addf %arg, %ld : f32
				///     affine.yield %yielded : f32
				///   }
				///
				const ReductionInfo *
				StandardReductionRecognizer::recognize(BlockArgument arg, Value yielded) const {
				  // A single combining operation that does not leak intermediate values is
				  // required. This covers most kinds of reductions, except for min and max
				  // that need two operations (compare and select).
				  Operation *combiningOp = getSingleOpCombiner(arg, yielded);
				  if (combiningOp == nullptr) {
				    LLVM_DEBUG(dbgs() << "Reduction not recognized, the combiner doesn't use "
				                         "`arg` or information is leaked:"
				                      << "\narg: " << arg << "\nyielded value: " << yielded);
				    return nullptr;
				  }

				  // The combiner has the right shape; now check that it is a known op.
				  if (const ReductionInfo *known = recognizeSingleOpCombiner(arg, yielded))
				    return known;

				  LLVM_DEBUG(dbgs() << "Reduction not recognized, unknown op: "
				                    << *combiningOp);
				  return nullptr;
				}

				/// Checks if `arg` is used exclusively by the operation generating the yielded
				/// value. On success the combining operation is returned. Null is returned
				/// when `yielded` has no defining operation, when `arg` or `yielded` has a
				/// number of uses other than one, or when the single user of `arg` is not the
				/// operation defining `yielded`.
				Operation *
				StandardReductionRecognizer::getSingleOpCombiner(BlockArgument arg,
				                                                 Value yielded) const {
				  // This is the combining operation like addf or muli.
				  Operation *combiner = yielded.getDefiningOp();
				  if (!combiner)
				    return nullptr;
				  // If `arg` or `yielded` are used elsewhere then we may leak intermediate
				  // values (like partial sums) making the reduction loop unvectorizable and
				  // unparallelizable (with simple methods at least).
				  if (!llvm::hasSingleElement(arg.getUses()) ||
				      !llvm::hasSingleElement(yielded.getUses()))
				    return nullptr;
				  // The only user of `arg` must be the combining operation.
				  if (*std::begin(arg.getUsers()) != combiner)
				    return nullptr;
				  return combiner;
				}

				/// Recognizes the simplest case when the reduction uses one operation.
				/// Returns the `StandardReductionInfo` instance matching the operation that
				/// defines `yielded` (addf, addi, mulf or muli), or null if `yielded` has no
				/// defining operation or the operation is not one of those. `arg` is not
				/// inspected here.
				const ReductionInfo *
				StandardReductionRecognizer::recognizeSingleOpCombiner(BlockArgument arg,
				                                                       Value yielded) const {
				  Operation *op = yielded.getDefiningOp();
				  // `isa<>` must not be applied to a null pointer; a value that is not
				  // produced by an operation cannot be a recognized combiner.
				  if (!op)
				    return nullptr;
				  if (isa<AddFOp>(op))
				    return StandardReductionInfo<AddFOp>::get();
				  if (isa<AddIOp>(op))
				    return StandardReductionInfo<AddIOp>::get();
				  if (isa<MulFOp>(op))
				    return StandardReductionInfo<MulFOp>::get();
				  if (isa<MulIOp>(op))
				    return StandardReductionInfo<MulIOp>::get();
				  return nullptr;
				}

				/// Returns true if `forOp` is a parallel loop possibly implementing known
				/// reductions via loop-carried variables (iter_args). Reductions are considered
				/// known (and parallel) if they are recognized by `reductionRecognizer`.
				/// Returns false as soon as the memory-dependence check fails or any
				/// iteration argument is not recognized as a reduction.
				bool mlir::isParallelReductionLoop(
				    AffineForOp forOp, const ReductionRecognizer &reductionRecognizer) {
				  // Check that there are no loop-carried memory dependences. Iteration
				  // arguments are ignored here because they are examined separately below.
				  if (!isLoopParallel(forOp, /*ignoreIterArgs=*/true))
				    return false;

				  // Check that all iteration arguments implement known reductions. Each
				  // region iteration argument is paired with the terminator operand at the
				  // same position.
				  auto iterArgs = forOp.getRegionIterArgs();
				  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
				  for (auto argAndYield : llvm::zip(iterArgs, yieldedValues)) {
				    const ReductionInfo *reduction = reductionRecognizer.recognize(
				        std::get<0>(argAndYield), std::get<1>(argAndYield));
				    if (!reduction)
				      return false;
				  }

				  return true;
				}

				/// Appends to `reductions` one entry per iteration argument of `forOp`: the
				/// `ReductionInfo` returned by `reductionRecognizer` for that argument and its
				/// yielded value, or null when the reduction is not recognized. Returns
				/// `true` only if every iteration argument was recognized, `false` otherwise.
				bool mlir::getKnownReductions(
				    AffineForOp forOp, const ReductionRecognizer &reductionRecognizer,
				    SmallVectorImpl<const ReductionInfo *> &reductions) {
				  auto carriedArgs = forOp.getRegionIterArgs();
				  auto nextValues = forOp.getBody()->getTerminator()->getOperands();

				  bool sawUnknown = false;
				  for (auto carriedAndNext : llvm::zip(carriedArgs, nextValues)) {
				    const ReductionInfo *info = reductionRecognizer.recognize(
				        std::get<0>(carriedAndNext), std::get<1>(carriedAndNext));
				    // Unrecognized reductions are still recorded (as null) so that the
				    // entries stay aligned with the iteration arguments.
				    reductions.push_back(info);
				    if (info == nullptr)
				      sawUnknown = true;
				  }

				  return !sawUnknown;
				}

mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir

	Show First 20 Lines • Show All 584 Lines • ▼ Show 20 Lines
	// CHECK: affine.for %{{.*}} = 0 to 256 step 128 {			// CHECK: affine.for %{{.*}} = 0 to 256 step 128 {
	// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>			// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
	// CHECK: %[[last_val:.]] = affine.for %{{.}} = 0 to 128 iter_args(%[[last_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {			// CHECK: %[[last_val:.]] = affine.for %{{.}} = 0 to 128 iter_args(%[[last_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
	// CHECK: %[[ld:.]] = vector.transfer_read %{{.}} : memref<128x256xf32>, vector<128xf32>			// CHECK: %[[ld:.]] = vector.transfer_read %{{.}} : memref<128x256xf32>, vector<128xf32>
	// CHECK: affine.yield %[[ld]] : vector<128xf32>			// CHECK: affine.yield %[[ld]] : vector<128xf32>
	// CHECK: }			// CHECK: }
	// CHECK: vector.transfer_write %[[last_val]], %{{.*}} : vector<128xf32>, memref<256xf32>			// CHECK: vector.transfer_write %[[last_val]], %{{.*}} : vector<128xf32>, memref<256xf32>
	// CHECK: }			// CHECK: }

				// -----

				// The inner reduction loop '%j' is not vectorized if we do not request
				// reduction vectorization.

				func @vec_vecdim_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vec_vecdim_reduction_rejected
				// CHECK-NOT: vector

mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir

This file was added.

				// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0 vectorize-reductions=true" -split-input-file \| FileCheck %s

				// The inner reduction loop '%j' is vectorized.

				func @vecdim_reduction(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vecdim_reduction
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[vred:.]] = affine.for %{{.}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[ld:.]] = vector.transfer_read %{{.}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
				// CHECK: affine.yield %[[add]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[final_sum:.]] = vector.reduction "add", %[[vred:.]] : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
				// CHECK: }

				// -----

				// The inner reduction loop '%j' is vectorized. (The order of addf's operands is
				// different than in the previous test case).

				func @vecdim_reduction_comm(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %ld, %red_iter : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vecdim_reduction_comm
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[vred:.]] = affine.for %{{.}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[ld:.]] = vector.transfer_read %{{.}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[ld]], %[[red_iter]] : vector<128xf32>
				// CHECK: affine.yield %[[add]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[final_sum:.]] = vector.reduction "add", %[[vred:.]] : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
				// CHECK: }

				// -----

				// The inner reduction loop '%j' is vectorized. Transforming the input before
				// performing the accumulation doesn't cause any problem.

				func @vecdim_reduction_expsin(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%sin = math.sin %ld : f32
				%exp = math.exp %sin : f32
				%add = addf %red_iter, %exp : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vecdim_reduction_expsin
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[vred:.]] = affine.for %{{.}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[ld:.]] = vector.transfer_read %{{.}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[sin:.*]] = math.sin %[[ld]]
				// CHECK: %[[exp:.*]] = math.exp %[[sin]]
				// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[exp]] : vector<128xf32>
				// CHECK: affine.yield %[[add]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[final_sum:.]] = vector.reduction "add", %[[vred:.]] : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
				// CHECK: }

				// -----

				// Two reductions at the same time. The inner reduction loop '%j' is vectorized.

				func @two_vecdim_reductions(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>) {
				%cst = constant 1.000000e+00 : f32
				affine.for %i = 0 to 256 {
				// Note that we pass the same constant '1.0' as initial values for both
				// reductions.
				%sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %part_sum, %ld : f32
				%mul = mulf %part_prod, %ld : f32
				affine.yield %add, %mul : f32, f32
				}
				affine.store %sum, %out_sum[%i] : memref<256xf32>
				affine.store %prod, %out_prod[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @two_vecdim_reductions
				// CHECK: %[[cst:.*]] = constant 1.000000e+00 : f32
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[vone:.*]] = constant dense<1.000000e+00> : vector<128xf32>
				// CHECK: %[[vred:.]]:2 = affine.for %{{.}} = 0 to 512 step 128 iter_args(%[[part_sum:.]] = %[[vzero]], %[[part_prod:.]] = %[[vone]]) -> (vector<128xf32>, vector<128xf32>) {
				// CHECK: %[[ld:.]] = vector.transfer_read %{{.}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[part_sum]], %[[ld]] : vector<128xf32>
				// CHECK: %[[mul:.*]] = mulf %[[part_prod]], %[[ld]] : vector<128xf32>
				// CHECK: affine.yield %[[add]], %[[mul]] : vector<128xf32>, vector<128xf32>
				// CHECK: }
				// CHECK: %[[nonfinal_sum:.]] = vector.reduction "add", %[[vred:.]]#0 : vector<128xf32> into f32
				// Note that to compute the final sum we need to add the original initial value
				// (%cst) since it is not zero.
				// CHECK: %[[final_sum:.*]] = addf %[[nonfinal_sum]], %[[cst]] : f32
				// For the final product we don't need to do this additional step because the
				// initial value equals 1 (the neutral element for multiplication).
				// CHECK: %[[final_prod:.]] = vector.reduction "mul", %[[vred:.]]#1 : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
				// CHECK: affine.store %[[final_prod]], %{{.*}} : memref<256xf32>
				// CHECK: }

				// -----

				// The integer case.

				func @two_vecdim_reductions_int(%in: memref<256x512xi64>, %out_sum: memref<256xi64>, %out_prod: memref<256xi64>) {
				%cst0 = constant 0 : i64
				%cst1 = constant 1 : i64
				affine.for %i = 0 to 256 {
				%sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst0, %part_prod = %cst1) -> (i64, i64) {
				%ld = affine.load %in[%i, %j] : memref<256x512xi64>
				%add = addi %part_sum, %ld : i64
				%mul = muli %part_prod, %ld : i64
				affine.yield %add, %mul : i64, i64
				}
				affine.store %sum, %out_sum[%i] : memref<256xi64>
				affine.store %prod, %out_prod[%i] : memref<256xi64>
				}
				return
				}

				// CHECK-LABEL: @two_vecdim_reductions_int
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0> : vector<128xi64>
				// CHECK: %[[vone:.*]] = constant dense<1> : vector<128xi64>
				// CHECK: %[[vred:.]]:2 = affine.for %{{.}} = 0 to 512 step 128 iter_args(%[[part_sum:.]] = %[[vzero]], %[[part_prod:.]] = %[[vone]]) -> (vector<128xi64>, vector<128xi64>) {
				// CHECK: %[[ld:.]] = vector.transfer_read %{{.}} : memref<256x512xi64>, vector<128xi64>
				// CHECK: %[[add:.*]] = addi %[[part_sum]], %[[ld]] : vector<128xi64>
				// CHECK: %[[mul:.*]] = muli %[[part_prod]], %[[ld]] : vector<128xi64>
				// CHECK: affine.yield %[[add]], %[[mul]] : vector<128xi64>, vector<128xi64>
				// CHECK: }
				// CHECK: %[[final_sum:.]] = vector.reduction "add", %[[vred:.]]#0 : vector<128xi64> into i64
				// CHECK: %[[final_prod:.]] = vector.reduction "mul", %[[vred:.]]#1 : vector<128xi64> into i64
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xi64>
				// CHECK: affine.store %[[final_prod]], %{{.*}} : memref<256xi64>
				// CHECK: }

				// -----

				// The outer reduction loop '%j' is vectorized.

				func @vecdim_reduction_nested(%in: memref<256x512xf32>, %out: memref<1xf32>) {
				%cst = constant 0.000000e+00 : f32
				%outer_red = affine.for %j = 0 to 512 iter_args(%outer_iter = %cst) -> (f32) {
				%inner_red = affine.for %i = 0 to 256 iter_args(%inner_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %inner_iter, %ld : f32
				affine.yield %add : f32
				}
				%outer_add = addf %outer_iter, %inner_red : f32
				affine.yield %outer_add : f32
				}
				affine.store %outer_red, %out[0] : memref<1xf32>
				return
				}

				// CHECK-LABEL: @vecdim_reduction_nested
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[outer_red:.]] = affine.for %{{.}} = 0 to 512 step 128 iter_args(%[[outer_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[inner_red:.]] = affine.for %{{.}} = 0 to 256 iter_args(%[[inner_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[ld:.]] = vector.transfer_read %{{.}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[inner_iter]], %[[ld]] : vector<128xf32>
				// CHECK: affine.yield %[[add]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[outer_add:.*]] = addf %[[outer_iter]], %[[inner_red]] : vector<128xf32>
				// CHECK: affine.yield %[[outer_add]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[final_sum:.]] = vector.reduction "add", %[[outer_red:.]] : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<1xf32>

				// -----

				// The inner reduction loop '%j' computes partial sums as a side effect and
				// is not vectorized.

				func @vecdim_partial_sums_1_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) {
				%cst = constant 1.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %part_sum, %ld : f32
				%mul = mulf %part_prod, %ld : f32
				affine.store %add, %out_partsum[%i, %j] : memref<256x512xf32>
				affine.yield %add, %mul : f32, f32
				}
				affine.store %sum, %out_sum[%i] : memref<256xf32>
				affine.store %prod, %out_prod[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vecdim_partial_sums_1_rejected
				// CHECK-NOT: vector

				// -----

				// The inner reduction loop '%j' computes partial sums as a side effect and
				// is not vectorized.

				func @vecdim_partial_sums_2_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) {
				%cst = constant 1.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
				affine.store %part_sum, %out_partsum[%i, %j] : memref<256x512xf32>
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %part_sum, %ld : f32
				%mul = mulf %part_prod, %ld : f32
				affine.yield %add, %mul : f32, f32
				}
				affine.store %sum, %out_sum[%i] : memref<256xf32>
				affine.store %prod, %out_prod[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vecdim_partial_sums_2_rejected
				// CHECK-NOT: vector

				// -----

				// The inner reduction loop '%j' performs an unknown reduction operation and is
				// not vectorized.

				func @vecdim_unknown_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 1.000000e+00 : f32
				%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
				%add = addf %red_iter, %red_iter : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[0] : memref<256xf32>
				return
				}

				// CHECK-LABEL: @vecdim_unknown_reduction_rejected
				// CHECK-NOT: vector

				// -----

				// The reduction loop '%j' doesn't perform any operation (it yields its
				// iteration argument unchanged), which is not recognized as a standard
				// reduction, so the loop is not vectorized.

				func @vecdim_none_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 1.000000e+00 : f32
				%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
				affine.yield %red_iter : f32
				}
				affine.store %final_red, %out[0] : memref<256xf32>
				return
				}

				// CHECK-LABEL: @vecdim_none_reduction_rejected
				// CHECK-NOT: vector

				// -----

				// The number of iterations is not divisible by the vector size, so a mask has
				// to be applied to the last update of the accumulator.

				func @vecdim_reduction_masked(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to 500 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK: #[[$map0:.]] = affine_map<([[d0:.]]) -> (-[[d0]] + 500)>
				// CHECK-LABEL: @vecdim_reduction_masked
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[vred:.]] = affine.for %[[iv:.]] = 0 to 500 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[elems_left:.*]] = affine.apply #[[$map0]](%[[iv]])
				// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
				// CHECK: %[[ld:.]] = vector.transfer_read %{{.}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
				// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
				// CHECK: affine.yield %[[new_acc]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[final_sum:.]] = vector.reduction "add", %[[vred:.]] : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
				// CHECK: }

				// -----

				// The number of iterations is not known, so a mask has to be applied.

				func @vecdim_reduction_masked_unknown_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %bnd: index) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to %bnd iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK: #[[$map1:.]] = affine_map<([[d0:.]]){{\[}}[[s0:.*]]{{\]}} -> (-[[d0]] + [[s0]])>
				// CHECK-LABEL: @vecdim_reduction_masked_unknown_ub
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[vred:.]] = affine.for %[[iv:.]] = 0 to %[[bnd:.]] step 128 iter_args(%[[red_iter:.]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[elems_left:.*]] = affine.apply #[[$map1]](%[[iv]])[%[[bnd]]]
				// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
				// CHECK: %[[ld:.]] = vector.transfer_read %{{.}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
				// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
				// CHECK: affine.yield %[[new_acc]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[final_sum:.]] = vector.reduction "add", %[[vred:.]] : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
				// CHECK: }

				// -----

				// The lower bound is nonzero, but the number of iterations is divisible by the
				// vector size, so masking is not needed.

				func @vecdim_reduction_nonzero_lb(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 127 to 511 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vecdim_reduction_nonzero_lb
				// CHECK: %{{.}} = affine.for %{{.}} = 127 to 511 step 128 iter_args({{.*}}) -> (vector<128xf32>) {
				// CHECK-NOT: vector.create_mask

				// -----

				// The lower bound is unknown, so we need to create a mask.

				func @vecdim_reduction_masked_unknown_lb(%in: memref<256x512xf32>, %out: memref<256xf32>, %lb: index) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = %lb to 512 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK: #[[$map2:.]] = affine_map<([[d0:.]]) -> (-[[d0]] + 512)>
				// CHECK-LABEL: @vecdim_reduction_masked_unknown_lb
				// CHECK: %{{.}} = affine.for %[[iv:.]] = %[[lb:.]] to 512 step 128 iter_args(%[[red_iter:.]] = {{.*}}) -> (vector<128xf32>) {
				// CHECK: %[[elems_left:.*]] = affine.apply #[[$map2]](%[[iv]])
				// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
				// CHECK: %[[ld:.]] = vector.transfer_read %{{.}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
				// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
				// CHECK: affine.yield %[[new_acc]] : vector<128xf32>

				// -----

				// The upper bound is a minimum expression.

				func @vecdim_reduction_complex_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %M: index, %N: index) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to min affine_map<(d0, d1) -> (d0, d1*2)>(%M, %N) iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK: #[[$map3:.]] = affine_map<([[d0:.]], [[d1:.]]) -> ([[d0]], [[d1]] 2)>
				// CHECK: #[[$map3_sub:.]] = affine_map<([[d0:.]], [[d1:.*]]) -> ([[d0]] - [[d1]])>
				// CHECK-LABEL: @vecdim_reduction_complex_ub
				// CHECK: %{{.}} = affine.for %[[iv:.]] = 0 to min #[[$map3]](%[[M:.]], %[[N:.]]) step 128 iter_args(%[[red_iter:.]] = {{.}}) -> (vector<128xf32>) {
				// CHECK: %[[ub:.*]] = affine.min #[[$map3]](%[[M]], %[[N]])
				// CHECK: %[[elems_left:.*]] = affine.apply #[[$map3_sub]](%[[ub]], %[[iv]])
				// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
				// CHECK: %[[ld:.]] = vector.transfer_read %{{.}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
				// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
				// CHECK: affine.yield %[[new_acc]] : vector<128xf32>

				// -----

				// The same mask is applied to both reductions.

				func @vecdim_two_reductions_masked(%in: memref<256x512xf32>, %out: memref<512xf32>) {
				  // Both accumulators start from the same zero constant.
				  %zero = constant 0.000000e+00 : f32
				  affine.for %row = 0 to 256 {
				    // Two reductions carried by one loop: the plain sum of a row and the
				    // sum of exp() of its elements. The loop runs 500 iterations, which is
				    // not a multiple of the 128-element vectors the CHECK lines expect.
				    %sum, %expsum = affine.for %col = 0 to 500 iter_args(%sum_acc = %zero, %expsum_acc = %zero) -> (f32, f32) {
				      %elem = affine.load %in[%row, %col] : memref<256x512xf32>
				      %elem_exp = math.exp %elem : f32
				      %sum_next = addf %sum_acc, %elem : f32
				      %expsum_next = addf %expsum_acc, %elem_exp : f32
				      affine.yield %sum_next, %expsum_next : f32, f32
				    }
				    // Results are interleaved in %out: even slots hold the sum, odd slots
				    // hold the exp-sum.
				    affine.store %sum, %out[2*%row] : memref<512xf32>
				    affine.store %expsum, %out[2*%row + 1] : memref<512xf32>
				  }
				  return
				}

				// CHECK: #[[$map4:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 500)>
				// CHECK-LABEL: @vecdim_two_reductions_masked
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %{{.*}} = affine.for %[[iv:.*]] = 0 to 500 step 128 iter_args(%[[sum_iter:.*]] = {{.*}}, %[[esum_iter:.*]] = {{.*}}) -> (vector<128xf32>, vector<128xf32>) {
				// CHECK: %[[elems_left:.*]] = affine.apply #[[$map4]](%[[iv]])
				// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
				// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[exp:.*]] = math.exp %[[ld]] : vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[sum_iter]], %[[ld]] : vector<128xf32>
				// CHECK: %[[eadd:.*]] = addf %[[esum_iter]], %[[exp]] : vector<128xf32>
				// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[sum_iter]] : vector<128xi1>, vector<128xf32>
				// CHECK: %[[new_eacc:.*]] = select %[[mask]], %[[eadd]], %[[esum_iter]] : vector<128xi1>, vector<128xf32>
				// CHECK: affine.yield %[[new_acc]], %[[new_eacc]] : vector<128xf32>
				// CHECK: }

mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction_2d.mlir

This file was added.

				// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0 vectorize-reductions=true" -verify-diagnostics

				// TODO: Vectorization of reduction loops along the reduction dimension is not
				// supported for higher-rank vectors yet, so we are just checking that an
				// error message is produced.

				// expected-error@+1 {{Vectorizing reductions is supported only for 1-D vectors}}
				func @vecdim_reduction_2d(%in: memref<256x512x1024xf32>, %out: memref<256xf32>) {
				  // Zero seeds both the inner and the outer accumulator.
				  %zero = constant 0.000000e+00 : f32
				  affine.for %i = 0 to 256 {
				    // Outer reduction over %j accumulates the per-(i, j) partial sums.
				    %total = affine.for %j = 0 to 512 iter_args(%outer_acc = %zero) -> (f32) {
				      // Inner reduction over %k sums one innermost line of %in.
				      %partial = affine.for %k = 0 to 1024 iter_args(%inner_acc = %zero) -> (f32) {
				        %elem = affine.load %in[%i, %j, %k] : memref<256x512x1024xf32>
				        %inner_next = addf %inner_acc, %elem : f32
				        affine.yield %inner_next : f32
				      }
				      %outer_next = addf %outer_acc, %partial : f32
				      affine.yield %outer_next : f32
				    }
				    affine.store %total, %out[%i] : memref<256xf32>
				  }
				  return
				}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Affine][Vector] Support vectorizing reduction loops
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 339837

mlir/include/mlir/Analysis/Utils.h

mlir/include/mlir/Dialect/Affine/Passes.td

mlir/include/mlir/Dialect/Affine/Utils.h

mlir/include/mlir/Transforms/ReductionUtils.h

mlir/lib/Analysis/Utils.cpp

mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp

mlir/lib/Transforms/Utils/CMakeLists.txt

mlir/lib/Transforms/Utils/ReductionUtils.cpp

mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir

mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir

mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction_2d.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Affine][Vector] Support vectorizing reduction loops ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 339837

mlir/include/mlir/Analysis/Utils.h

mlir/include/mlir/Dialect/Affine/Passes.td

mlir/include/mlir/Dialect/Affine/Utils.h

mlir/include/mlir/Transforms/ReductionUtils.h

mlir/lib/Analysis/Utils.cpp

mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp

mlir/lib/Transforms/Utils/CMakeLists.txt

mlir/lib/Transforms/Utils/ReductionUtils.cpp

mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir

mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir

mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction_2d.mlir

[mlir][Affine][Vector] Support vectorizing reduction loops
ClosedPublic