Diff 343080

mlir/include/mlir/Analysis/AffineAnalysis.h

	Show First 20 Lines • Show All 41 Lines • ▼ Show 20 Lines

	/// Returns true if `forOp` is a parallel loop. If `parallelReductions` is			/// Returns true if `forOp` is a parallel loop. If `parallelReductions` is
	/// provided, populates it with descriptors of the parallelizable reductions and			/// provided, populates it with descriptors of the parallelizable reductions and
	/// treats them as not preventing parallelization.			/// treats them as not preventing parallelization.
	bool isLoopParallel(			bool isLoopParallel(
	AffineForOp forOp,			AffineForOp forOp,
	SmallVectorImpl<LoopReduction> *parallelReductions = nullptr);			SmallVectorImpl<LoopReduction> *parallelReductions = nullptr);

				/// Returns true if `forOp` doesn't have memory dependences preventing
				/// parallelization. This function doesn't check iter_args and should be used
				/// only as a building block for full parallel-checking functions.
				bool isLoopMemoryParallel(AffineForOp forOp);

	/// Returns in `affineApplyOps`, the sequence of those AffineApplyOp			/// Returns in `affineApplyOps`, the sequence of those AffineApplyOp
	/// Operations that are reachable via a search starting from `operands` and			/// Operations that are reachable via a search starting from `operands` and
	/// ending at those operands that are not the result of an AffineApplyOp.			/// ending at those operands that are not the result of an AffineApplyOp.
	void getReachableAffineApplyOps(ArrayRef<Value> operands,			void getReachableAffineApplyOps(ArrayRef<Value> operands,
	SmallVectorImpl<Operation *> &affineApplyOps);			SmallVectorImpl<Operation *> &affineApplyOps);

	/// Builds a system of constraints with dimensional identifiers corresponding to			/// Builds a system of constraints with dimensional identifiers corresponding to
	/// the loop IVs of the forOps and AffineIfOp's operands appearing in			/// the loop IVs of the forOps and AffineIfOp's operands appearing in
	▲ Show 20 Lines • Show All 96 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/Affine/Passes.td

Show First 20 Lines • Show All 106 Lines • ▼ Show 20 Lines	let options = [
// the index represents the loop depth, the value represents the k^th		// the index represents the loop depth, the value represents the k^th
// fastest varying memory dimension.		// fastest varying memory dimension.
// This is voluntarily restrictive and is meant to precisely target a		// This is voluntarily restrictive and is meant to precisely target a
// particular loop/op pair, for testing purposes.		// particular loop/op pair, for testing purposes.
ListOption<"fastestVaryingPattern", "test-fastest-varying", "int64_t",		ListOption<"fastestVaryingPattern", "test-fastest-varying", "int64_t",
"Specify a 1-D, 2-D or 3-D pattern of fastest varying memory "		"Specify a 1-D, 2-D or 3-D pattern of fastest varying memory "
"dimensions to match. See defaultPatterns in Vectorize.cpp for "		"dimensions to match. See defaultPatterns in Vectorize.cpp for "
"a description and examples. This is used for testing purposes",		"a description and examples. This is used for testing purposes",
"llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">		"llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
		Option<"vectorizeReductions", "vectorize-reductions", "bool",
		/*default=*/"false",
		"Vectorize known reductions expressed via iter_args. "
		"Switched off by default.">
];		];
}		}

def AffineParallelize : FunctionPass<"affine-parallelize"> {		def AffineParallelize : FunctionPass<"affine-parallelize"> {
let summary = "Convert affine.for ops into 1-D affine.parallel";		let summary = "Convert affine.for ops into 1-D affine.parallel";
let constructor = "mlir::createAffineParallelizePass()";		let constructor = "mlir::createAffineParallelizePass()";
let options = [		let options = [
Option<"maxNested", "max-nested", "unsigned", /*default=*/"-1u",		Option<"maxNested", "max-nested", "unsigned", /*default=*/"-1u",
Show All 20 Lines

mlir/include/mlir/Dialect/Affine/Utils.h

	//===- Utils.h - Affine dialect utilities ------------------------ C++ --===//			//===- Utils.h - Affine dialect utilities ------------------------ C++ --===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// This header file declares a set of utilities for the affine dialect ops.			// This header file declares a set of utilities for the affine dialect ops.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef MLIR_DIALECT_AFFINE_UTILS_H			#ifndef MLIR_DIALECT_AFFINE_UTILS_H
	#define MLIR_DIALECT_AFFINE_UTILS_H			#define MLIR_DIALECT_AFFINE_UTILS_H

				#include "mlir/Analysis/AffineAnalysis.h"
	#include "mlir/IR/AffineExpr.h"			#include "mlir/IR/AffineExpr.h"
	#include "mlir/Support/LLVM.h"			#include "mlir/Support/LLVM.h"
	#include "llvm/ADT/DenseMap.h"			#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/SmallVector.h"			#include "llvm/ADT/SmallVector.h"

	namespace mlir {			namespace mlir {
				bondhugulaUnsubmitted Not Done Reply Inline Actions Prune includes please. bondhugula: Prune includes please.

	class AffineForOp;			class AffineForOp;
	class AffineIfOp;			class AffineIfOp;
	class AffineParallelOp;			class AffineParallelOp;
	struct LogicalResult;			struct LogicalResult;
	struct LoopReduction;			struct LoopReduction;
	class Operation;			class Operation;

				using ReductionLoopMap = DenseMap<Operation *, SmallVector<LoopReduction, 2>>;

	/// Replaces parallel affine.for op with 1-d affine.parallel op.			/// Replaces parallel affine.for op with 1-d affine.parallel op.
	/// mlir::isLoopParallel detects the parallel affine.for ops.			/// mlir::isLoopParallel detects the parallel affine.for ops.
	/// Parallelizes the specified reductions. Parallelization will fail in presence			/// Parallelizes the specified reductions. Parallelization will fail in presence
	/// of loop iteration arguments that are not listed in `parallelReductions`.			/// of loop iteration arguments that are not listed in `parallelReductions`.
	/// There is no cost model currently used to drive this parallelization.			/// There is no cost model currently used to drive this parallelization.
	LogicalResult			LogicalResult
	affineParallelize(AffineForOp forOp,			affineParallelize(AffineForOp forOp,
	ArrayRef<LoopReduction> parallelReductions = {});			ArrayRef<LoopReduction> parallelReductions = {});
	Show All 38 Lines
	struct VectorizationStrategy {			struct VectorizationStrategy {
	// Vectorization factors to apply to each target vector dimension.			// Vectorization factors to apply to each target vector dimension.
	// Each factor will be applied to a different loop.			// Each factor will be applied to a different loop.
	SmallVector<int64_t, 8> vectorSizes;			SmallVector<int64_t, 8> vectorSizes;
	// Maps each AffineForOp vectorization candidate with its vector dimension.			// Maps each AffineForOp vectorization candidate with its vector dimension.
	// The candidate will be vectorized using the vectorization factor in			// The candidate will be vectorized using the vectorization factor in
	// 'vectorSizes' for that dimension.			// 'vectorSizes' for that dimension.
	DenseMap<Operation *, unsigned> loopToVectorDim;			DenseMap<Operation *, unsigned> loopToVectorDim;
				// Maps loops that implement vectorizable reductions to the corresponding
				// reduction descriptors.
				ReductionLoopMap reductionLoops;
	};			};

	/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in			/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
	/// 'vectorSizes'. By default, each vectorization factor is applied			/// 'vectorSizes'. By default, each vectorization factor is applied
	/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can			/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
	/// be optionally used to provide a different loop vectorization order.			/// be optionally used to provide a different loop vectorization order.
				/// If `reductionLoops` is not empty, the given reduction loops may be
				/// vectorized along the reduction dimension.
				/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
	void vectorizeAffineLoops(			void vectorizeAffineLoops(
	Operation *parentOp,			Operation *parentOp,
	llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,			llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
	ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern);			ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern,
				const ReductionLoopMap &reductionLoops = ReductionLoopMap());

	/// External utility to vectorize affine loops from a single loop nest using an			/// External utility to vectorize affine loops from a single loop nest using an
	/// n-D vectorization strategy (see doc in VectorizationStrategy definition).			/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
	/// Loops are provided in a 2D vector container. The first dimension represents			/// Loops are provided in a 2D vector container. The first dimension represents
	/// the nesting level relative to the loops to be vectorized. The second			/// the nesting level relative to the loops to be vectorized. The second
	/// dimension contains the loops. This means that:			/// dimension contains the loops. This means that:
	/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',			/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
	/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.			/// b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
	▲ Show 20 Lines • Show All 49 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/StandardOps/IR/Ops.h

	Show First 20 Lines • Show All 118 Lines • ▼ Show 20 Lines
	bool applyCmpPredicate(CmpFPredicate predicate, const APFloat &lhs,			bool applyCmpPredicate(CmpFPredicate predicate, const APFloat &lhs,
	const APFloat &rhs);			const APFloat &rhs);

	/// Return true if ofr1 and ofr2 are the same integer constant attribute values			/// Return true if ofr1 and ofr2 are the same integer constant attribute values
	/// or the same SSA value.			/// or the same SSA value.
	/// Ignore integer bitwidth and type mismatch that come from the fact there is			/// Ignore integer bitwidth and type mismatch that come from the fact there is
	/// no IndexAttr and that IndexType has no bitwidth.			/// no IndexAttr and that IndexType has no bitwidth.
	bool isEqualConstantIntOrValue(OpFoldResult ofr1, OpFoldResult ofr2);			bool isEqualConstantIntOrValue(OpFoldResult ofr1, OpFoldResult ofr2);

				/// Returns the identity value attribute associated with an AtomicRMWKind op.
				Attribute getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
				OpBuilder &builder, Location loc);

				/// Returns the identity value associated with an AtomicRMWKind op.
				Value getIdentityValue(AtomicRMWKind op, Type resultType, OpBuilder &builder,
				Location loc);

				/// Returns the value obtained by applying the reduction operation kind
				/// associated with a binary AtomicRMWKind op to `lhs` and `rhs`.
				Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
				Value lhs, Value rhs);

	} // end namespace mlir			} // end namespace mlir

	#endif // MLIR_DIALECT_IR_STANDARDOPS_IR_OPS_H			#endif // MLIR_DIALECT_IR_STANDARDOPS_IR_OPS_H

mlir/include/mlir/Dialect/Vector/VectorOps.h

	//===- VectorOps.h - MLIR Vector Dialect Operations -------------- C++ --===//			//===- VectorOps.h - MLIR Vector Dialect Operations -------------- C++ --===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// This file defines the Vector dialect.			// This file defines the Vector dialect.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef MLIR_DIALECT_VECTOR_VECTOROPS_H			#ifndef MLIR_DIALECT_VECTOR_VECTOROPS_H
	#define MLIR_DIALECT_VECTOR_VECTOROPS_H			#define MLIR_DIALECT_VECTOR_VECTOROPS_H

				#include "mlir/Dialect/StandardOps/IR/Ops.h"
	#include "mlir/IR/AffineMap.h"			#include "mlir/IR/AffineMap.h"
	#include "mlir/IR/Attributes.h"			#include "mlir/IR/Attributes.h"
	#include "mlir/IR/BuiltinTypes.h"			#include "mlir/IR/BuiltinTypes.h"
	#include "mlir/IR/Dialect.h"			#include "mlir/IR/Dialect.h"
	#include "mlir/IR/OpDefinition.h"			#include "mlir/IR/OpDefinition.h"
	#include "mlir/Interfaces/SideEffectInterfaces.h"			#include "mlir/Interfaces/SideEffectInterfaces.h"
	#include "mlir/Interfaces/VectorInterfaces.h"			#include "mlir/Interfaces/VectorInterfaces.h"
	#include "mlir/Interfaces/ViewLikeInterface.h"			#include "mlir/Interfaces/ViewLikeInterface.h"
	▲ Show 20 Lines • Show All 163 Lines • ▼ Show 20 Lines

	/// Returns the integer type required for subscripts in the vector dialect.			/// Returns the integer type required for subscripts in the vector dialect.
	IntegerType getVectorSubscriptType(Builder &builder);			IntegerType getVectorSubscriptType(Builder &builder);

	/// Returns an integer array attribute containing the given values using			/// Returns an integer array attribute containing the given values using
	/// the integer type required for subscripts in the vector dialect.			/// the integer type required for subscripts in the vector dialect.
	ArrayAttr getVectorSubscriptAttr(Builder &b, ArrayRef<int64_t> values);			ArrayAttr getVectorSubscriptAttr(Builder &b, ArrayRef<int64_t> values);

				/// Returns the value obtained by reducing the vector into a scalar using the
				/// operation kind associated with a binary AtomicRMWKind op.
				Value getVectorReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
				Value vector);

	namespace impl {			namespace impl {
	/// Build the default minor identity map suitable for a vector transfer. This			/// Build the default minor identity map suitable for a vector transfer. This
	/// also handles the case memref<... x vector<...>> -> vector<...> in which the			/// also handles the case memref<... x vector<...>> -> vector<...> in which the
	/// rank of the identity map must take the vector element type into account.			/// rank of the identity map must take the vector element type into account.
	AffineMap getTransferMinorIdentityMap(ShapedType shapedType,			AffineMap getTransferMinorIdentityMap(ShapedType shapedType,
	VectorType vectorType);			VectorType vectorType);
	} // namespace impl			} // namespace impl
	} // end namespace vector			} // end namespace vector
	} // end namespace mlir			} // end namespace mlir

	#define GET_OP_CLASSES			#define GET_OP_CLASSES
	#include "mlir/Dialect/Vector/VectorOps.h.inc"			#include "mlir/Dialect/Vector/VectorOps.h.inc"
	#include "mlir/Dialect/Vector/VectorOpsDialect.h.inc"			#include "mlir/Dialect/Vector/VectorOpsDialect.h.inc"

	#endif // MLIR_DIALECT_VECTOR_VECTOROPS_H			#endif // MLIR_DIALECT_VECTOR_VECTOROPS_H

mlir/lib/Analysis/AffineAnalysis.cpp

Show First 20 Lines • Show All 64 Lines • ▼ Show 20 Lines	static Value getSupportedReduction(AffineForOp forOp, unsigned pos,
AtomicRMWKind &kind) {		AtomicRMWKind &kind) {
auto yieldOp = cast<AffineYieldOp>(forOp.getBody()->back());		auto yieldOp = cast<AffineYieldOp>(forOp.getBody()->back());
Value yielded = yieldOp.operands()[pos];		Value yielded = yieldOp.operands()[pos];
Operation *definition = yielded.getDefiningOp();		Operation *definition = yielded.getDefiningOp();
if (!definition)		if (!definition)
return nullptr;		return nullptr;
if (!forOp.getRegionIterArgs()[pos].hasOneUse())		if (!forOp.getRegionIterArgs()[pos].hasOneUse())
return nullptr;		return nullptr;
		if (!yielded.hasOneUse())
		return nullptr;

Optional<AtomicRMWKind> maybeKind =		Optional<AtomicRMWKind> maybeKind =
TypeSwitch<Operation *, Optional<AtomicRMWKind>>(definition)		TypeSwitch<Operation *, Optional<AtomicRMWKind>>(definition)
.Case<AddFOp>([](Operation *) { return AtomicRMWKind::addf; })		.Case<AddFOp>([](Operation *) { return AtomicRMWKind::addf; })
.Case<MulFOp>([](Operation *) { return AtomicRMWKind::mulf; })		.Case<MulFOp>([](Operation *) { return AtomicRMWKind::mulf; })
.Case<AddIOp>([](Operation *) { return AtomicRMWKind::addi; })		.Case<AddIOp>([](Operation *) { return AtomicRMWKind::addi; })
.Case<MulIOp>([](Operation *) { return AtomicRMWKind::muli; })		.Case<MulIOp>([](Operation *) { return AtomicRMWKind::muli; })
.Default([](Operation *) -> Optional<AtomicRMWKind> {		.Default([](Operation *) -> Optional<AtomicRMWKind> {
Show All 37 Lines	if (parallelReductions) {
}		}

// Return later to allow for identifying all parallel reductions even if the		// Return later to allow for identifying all parallel reductions even if the
// loop is not parallel.		// loop is not parallel.
if (parallelReductions->size() != numIterArgs)		if (parallelReductions->size() != numIterArgs)
return false;		return false;
}		}

		// Check memory dependences.
		return isLoopMemoryParallel(forOp);
		}

		/// Returns true if `forOp` doesn't have memory dependences preventing
		/// parallelization. This function doesn't check iter_args and should be used
		/// only as a building block for full parallel-checking functions.
		bool mlir::isLoopMemoryParallel(AffineForOp forOp) {
// Collect all load and store ops in loop nest rooted at 'forOp'.		// Collect all load and store ops in loop nest rooted at 'forOp'.
SmallVector<Operation *, 8> loadAndStoreOps;		SmallVector<Operation *, 8> loadAndStoreOps;
auto walkResult = forOp.walk([&](Operation *op) -> WalkResult {		auto walkResult = forOp.walk([&](Operation *op) -> WalkResult {
if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))		if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
loadAndStoreOps.push_back(op);		loadAndStoreOps.push_back(op);
else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(op) &&		else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(op) &&
!MemoryEffectOpInterface::hasNoEffect(op))		!MemoryEffectOpInterface::hasNoEffect(op))
return WalkResult::interrupt();		return WalkResult::interrupt();
▲ Show 20 Lines • Show All 977 Lines • Show Last 20 Lines

mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp

Show First 20 Lines • Show All 361 Lines • ▼ Show 20 Lines	LogicalResult matchAndRewrite(AffineForOp op,
rewriter.eraseBlock(scfForOp.getBody());		rewriter.eraseBlock(scfForOp.getBody());
rewriter.inlineRegionBefore(op.region(), scfForOp.region(),		rewriter.inlineRegionBefore(op.region(), scfForOp.region(),
scfForOp.region().end());		scfForOp.region().end());
rewriter.replaceOp(op, scfForOp.results());		rewriter.replaceOp(op, scfForOp.results());
return success();		return success();
}		}
};		};

/// Builds a constant of type `resultType` holding the neutral element of the
/// reduction kind `op`: 0 for `addf`/`addi` and 1 for `mulf`/`muli`. For any
/// other kind, reports "Reduction operation type not supported" through
/// `emitOptionalError` at `loc` and returns a null Value.
static Value getIdentityValue(AtomicRMWKind op, Type resultType,
                              OpBuilder &builder, Location loc) {
  // Additive reductions start from zero.
  if (op == AtomicRMWKind::addf)
    return builder.create<ConstantOp>(loc, builder.getFloatAttr(resultType, 0));
  if (op == AtomicRMWKind::addi)
    return builder.create<ConstantOp>(loc,
                                      builder.getIntegerAttr(resultType, 0));

  // Multiplicative reductions start from one.
  if (op == AtomicRMWKind::mulf)
    return builder.create<ConstantOp>(loc, builder.getFloatAttr(resultType, 1));
  if (op == AtomicRMWKind::muli)
    return builder.create<ConstantOp>(loc,
                                      builder.getIntegerAttr(resultType, 1));

  // TODO: Add remaining reduction operations.
  (void)emitOptionalError(loc, "Reduction operation type not supported");
  return nullptr;
}

/// Creates, at `loc`, the binary arithmetic op that corresponds to the
/// reduction kind `op` and applies it to `lhs` and `rhs`: `AddFOp`, `AddIOp`,
/// `MulFOp` or `MulIOp` for `addf`, `addi`, `mulf` and `muli` respectively.
/// For any other kind, reports "Reduction operation type not supported"
/// through `emitOptionalError` and returns a null Value.
static Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
                            Value lhs, Value rhs) {
  if (op == AtomicRMWKind::addf)
    return builder.create<AddFOp>(loc, lhs, rhs);
  if (op == AtomicRMWKind::addi)
    return builder.create<AddIOp>(loc, lhs, rhs);
  if (op == AtomicRMWKind::mulf)
    return builder.create<MulFOp>(loc, lhs, rhs);
  if (op == AtomicRMWKind::muli)
    return builder.create<MulIOp>(loc, lhs, rhs);

  // TODO: Add remaining reduction operations.
  (void)emitOptionalError(loc, "Reduction operation type not supported");
  return nullptr;
}

/// Convert an `affine.parallel` (loop nest) operation into a `scf.parallel`		/// Convert an `affine.parallel` (loop nest) operation into a `scf.parallel`
/// operation.		/// operation.
class AffineParallelLowering : public OpRewritePattern<AffineParallelOp> {		class AffineParallelLowering : public OpRewritePattern<AffineParallelOp> {
public:		public:
using OpRewritePattern<AffineParallelOp>::OpRewritePattern;		using OpRewritePattern<AffineParallelOp>::OpRewritePattern;

LogicalResult matchAndRewrite(AffineParallelOp op,		LogicalResult matchAndRewrite(AffineParallelOp op,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
▲ Show 20 Lines • Show All 389 Lines • Show Last 20 Lines

mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp

Show All 15 Lines
#include "mlir/Analysis/LoopAnalysis.h"		#include "mlir/Analysis/LoopAnalysis.h"
#include "mlir/Analysis/NestedMatcher.h"		#include "mlir/Analysis/NestedMatcher.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"		#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"		#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Vector/VectorOps.h"		#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"		#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/BlockAndValueMapping.h"		#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/Support/LLVM.h"		#include "mlir/Support/LLVM.h"
		#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"

using namespace mlir;		using namespace mlir;
using namespace vector;		using namespace vector;

///		///
/// Implements a high-level vectorization strategy on a Function.		/// Implements a high-level vectorization strategy on a Function.
/// The abstraction used is that of super-vectors, which provide a single,		/// The abstraction used is that of super-vectors, which provide a single,
▲ Show 20 Lines • Show All 287 Lines • ▼ Show 20 Lines
/// affine.for %i = %M to %N step 128 {		/// affine.for %i = %M to %N step 128 {
/// %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>		/// %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>
/// }		/// }
/// ```		/// ```
///		///
/// Unsupported cases, extensions, and work in progress (help welcome :-) ):		/// Unsupported cases, extensions, and work in progress (help welcome :-) ):
/// ========================================================================		/// ========================================================================
/// 1. lowering to concrete vector types for various HW;		/// 1. lowering to concrete vector types for various HW;
/// 2. reduction support;		/// 2. reduction support for n-D vectorization and non-unit steps;
/// 3. non-effecting padding during vector.transfer_read and filter during		/// 3. non-effecting padding during vector.transfer_read and filter during
/// vector.transfer_write;		/// vector.transfer_write;
/// 4. misalignment support vector.transfer_read / vector.transfer_write		/// 4. misalignment support vector.transfer_read / vector.transfer_write
/// (hopefully without read-modify-writes);		/// (hopefully without read-modify-writes);
/// 5. control-flow support;		/// 5. control-flow support;
/// 6. cost-models, heuristics and search;		/// 6. cost-models, heuristics and search;
/// 7. Op implementation, extensions and implication on memref views;		/// 7. Op implementation, extensions and implication on memref views;
/// 8. many TODOs left around.		/// 8. many TODOs left around.
▲ Show 20 Lines • Show All 146 Lines • ▼ Show 20 Lines
/// %c42 = constant 42 : index		/// %c42 = constant 42 : index
/// %9 = load %2[%c7, %c42] : memref<?x?xf32>		/// %9 = load %2[%c7, %c42] : memref<?x?xf32>
/// return %9 : f32		/// return %9 : f32
/// }		/// }
/// ```		/// ```
///		///
/// Of course, much more intricate n-D imperfectly-nested patterns can be		/// Of course, much more intricate n-D imperfectly-nested patterns can be
/// vectorized too and specified in a fully declarative fashion.		/// vectorized too and specified in a fully declarative fashion.
		///
		/// Reduction:
		/// ==========
		/// Vectorizing reduction loops along the reduction dimension is supported if:
		/// - the reduction kind is supported,
		/// - the vectorization is 1-D, and
		/// - the step size of the loop is equal to one.
		///
		/// Compared to the non-vector-dimension case, two additional things are done
		/// during vectorization of such loops:
		/// - The resulting vector returned from the loop is reduced to a scalar using
		/// `vector.reduction`.
		/// - In some cases a mask is applied to the vector yielded at the end of the
		/// loop to prevent garbage values from being written to the accumulator.
		///
		/// Reduction vectorization is switched off by default; it can be enabled by
		/// passing a map from loops to reductions to utility functions, or by passing
		/// `vectorize-reductions=true` to the vectorization pass.
		///
		/// Consider the following example:
		/// ```mlir
		/// func @vecred(%in: memref<512xf32>) -> f32 {
		/// %cst = constant 0.000000e+00 : f32
		/// %sum = affine.for %i = 0 to 500 iter_args(%part_sum = %cst) -> (f32) {
		/// %ld = affine.load %in[%i] : memref<512xf32>
		/// %cos = math.cos %ld : f32
		/// %add = addf %part_sum, %cos : f32
		/// affine.yield %add : f32
		/// }
		/// return %sum : f32
		/// }
		/// ```
		///
		/// The -affine-vectorize pass with the following arguments:
		/// ```
		/// -affine-vectorize="virtual-vector-size=128 test-fastest-varying=0 \
		/// vectorize-reductions=true"
		/// ```
		/// produces the following output:
		/// ```mlir
		/// #map = affine_map<(d0) -> (-d0 + 500)>
		/// func @vecred(%arg0: memref<512xf32>) -> f32 {
		/// %cst = constant 0.000000e+00 : f32
		/// %cst_0 = constant dense<0.000000e+00> : vector<128xf32>
		/// %0 = affine.for %arg1 = 0 to 500 step 128 iter_args(%arg2 = %cst_0)
		/// -> (vector<128xf32>) {
		/// // %2 is the number of iterations left in the original loop.
		/// %2 = affine.apply #map(%arg1)
		/// %3 = vector.create_mask %2 : vector<128xi1>
		/// %cst_1 = constant 0.000000e+00 : f32
		/// %4 = vector.transfer_read %arg0[%arg1], %cst_1 :
		/// memref<512xf32>, vector<128xf32>
		/// %5 = math.cos %4 : vector<128xf32>
		/// %6 = addf %arg2, %5 : vector<128xf32>
		/// // We filter out the effect of last 12 elements using the mask.
		/// %7 = select %3, %6, %arg2 : vector<128xi1>, vector<128xf32>
		/// affine.yield %7 : vector<128xf32>
		/// }
		/// %1 = vector.reduction "add", %0 : vector<128xf32> into f32
		/// return %1 : f32
		/// }
		/// ```
		///
		/// Note that because of loop misalignment we needed to apply a mask to prevent
		/// last 12 elements from affecting the final result. The mask is full of ones
		/// in every iteration except for the last one, in which it has the form
		/// `11...100...0` with 116 ones and 12 zeros.

#define DEBUG_TYPE "early-vect"		#define DEBUG_TYPE "early-vect"

using llvm::dbgs;		using llvm::dbgs;

/// Forward declaration.		/// Forward declaration.
static FilterFunctionType		static FilterFunctionType
isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,		isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,
▲ Show 20 Lines • Show All 141 Lines • ▼ Show 20 Lines	struct VectorizationState {
/// within the vector loop.		/// within the vector loop.
///		///
/// Example:		/// Example:
/// * 'replaced': induction variable of a loop to be vectorized.		/// * 'replaced': induction variable of a loop to be vectorized.
/// * 'replacement': new induction variable in the new vector loop.		/// * 'replacement': new induction variable in the new vector loop.
void registerValueScalarReplacement(BlockArgument replaced,		void registerValueScalarReplacement(BlockArgument replaced,
BlockArgument replacement);		BlockArgument replacement);

		/// Registers the scalar replacement of a scalar result returned from a
		/// reduction loop. 'replacement' must be scalar.
		///
		/// This utility is used to register the replacement for scalar results of
		/// vectorized reduction loops with iter_args.
		///
		/// Example 2:
		/// * 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32)
		/// * 'replacement': %1 = vector.reduction "add" %0 : vector<4xf32> into f32
		void registerLoopResultScalarReplacement(Value replaced, Value replacement);

/// Returns in 'replacedVals' the scalar replacement for values in		/// Returns in 'replacedVals' the scalar replacement for values in
/// 'inputVals'.		/// 'inputVals'.
void getScalarValueReplacementsFor(ValueRange inputVals,		void getScalarValueReplacementsFor(ValueRange inputVals,
SmallVectorImpl<Value> &replacedVals);		SmallVectorImpl<Value> &replacedVals);

/// Erases the scalar loop nest after its successful vectorization.		/// Erases the scalar loop nest after its successful vectorization.
void finishVectorizationPattern(AffineForOp rootLoop);		void finishVectorizationPattern(AffineForOp rootLoop);

// Used to build and insert all the new operations created. The insertion		// Used to build and insert all the new operations created. The insertion
// point is preserved and updated along the vectorization process.		// point is preserved and updated along the vectorization process.
OpBuilder builder;		OpBuilder builder;

// Maps input scalar operations to their vector counterparts.		// Maps input scalar operations to their vector counterparts.
DenseMap<Operation *, Operation *> opVectorReplacement;		DenseMap<Operation *, Operation *> opVectorReplacement;
// Maps input scalar values to their vector counterparts.		// Maps input scalar values to their vector counterparts.
BlockAndValueMapping valueVectorReplacement;		BlockAndValueMapping valueVectorReplacement;
// Maps input scalar values to their new scalar counterparts in the vector		// Maps input scalar values to their new scalar counterparts in the vector
// loop nest.		// loop nest.
BlockAndValueMapping valueScalarReplacement;		BlockAndValueMapping valueScalarReplacement;
		// Maps results of reduction loops to their new scalar counterparts.
		DenseMap<Value, Value> loopResultScalarReplacement;

// Maps the newly created vector loops to their vector dimension.		// Maps the newly created vector loops to their vector dimension.
DenseMap<Operation *, unsigned> vecLoopToVecDim;		DenseMap<Operation *, unsigned> vecLoopToVecDim;

		// Maps the new vectorized loops to the corresponding vector masks when one
		// is required.
		DenseMap<Operation *, Value> vecLoopToMask;

// The strategy drives which loop to vectorize by which amount.		// The strategy drives which loop to vectorize by which amount.
const VectorizationStrategy *strategy;		const VectorizationStrategy *strategy;

private:		private:
/// Internal implementation to map input scalar values to new vector or scalar		/// Internal implementation to map input scalar values to new vector or scalar
/// values.		/// values.
void registerValueVectorReplacementImpl(Value replaced, Value replacement);		void registerValueVectorReplacementImpl(Value replaced, Value replacement);
void registerValueScalarReplacementImpl(Value replaced, Value replacement);		void registerValueScalarReplacementImpl(Value replaced, Value replacement);
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines
/// Example:		/// Example:
/// * 'replaced': induction variable of a loop to be vectorized.		/// * 'replaced': induction variable of a loop to be vectorized.
/// * 'replacement': new induction variable in the new vector loop.		/// * 'replacement': new induction variable in the new vector loop.
void VectorizationState::registerValueScalarReplacement(		void VectorizationState::registerValueScalarReplacement(
BlockArgument replaced, BlockArgument replacement) {		BlockArgument replaced, BlockArgument replacement) {
registerValueScalarReplacementImpl(replaced, replacement);		registerValueScalarReplacementImpl(replaced, replacement);
}		}

		/// Registers the scalar replacement of a scalar result returned from a
		/// reduction loop.
		///
		/// Preconditions (checked with assertions):
		/// * 'replaced' must be a result of an AffineForOp.
		/// * 'replacement' must be scalar (not a vector).
		/// * 'replaced' must not have been registered before.
		///
		/// This utility is used to register the replacement for scalar results of
		/// vectorized reduction loops with iter_args.
		///
		/// Example:
		///   * 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32)
		///   * 'replacement': %1 = vector.reduction "add" %0 : vector<4xf32> into f32
		void VectorizationState::registerLoopResultScalarReplacement(
		    Value replaced, Value replacement) {
		  // `getDefiningOp<OpTy>()` tolerates values without a defining op (e.g. block
		  // arguments), so a misuse reports this message instead of tripping over a
		  // null pointer inside `isa`.
		  assert(replaced.getDefiningOp<AffineForOp>() &&
		         "Expected a result of an affine.for op");
		  // Same scalar-type check as the other scalar replacement registration.
		  assert(!replacement.getType().isa<VectorType>() &&
		         "Expected scalar type in scalar replacement");
		  assert(loopResultScalarReplacement.count(replaced) == 0 &&
		         "already registered");
		  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ will replace a result of the loop "
		                       "with scalar: "
		                    << replacement);
		  loopResultScalarReplacement[replaced] = replacement;
		}

void VectorizationState::registerValueScalarReplacementImpl(Value replaced,		void VectorizationState::registerValueScalarReplacementImpl(Value replaced,
Value replacement) {		Value replacement) {
assert(!valueScalarReplacement.contains(replaced) &&		assert(!valueScalarReplacement.contains(replaced) &&
"Scalar value replacement already registered");		"Scalar value replacement already registered");
assert(!replacement.getType().isa<VectorType>() &&		assert(!replacement.getType().isa<VectorType>() &&
"Expected scalar type in scalar replacement");		"Expected scalar type in scalar replacement");
valueScalarReplacement.map(replaced, replacement);		valueScalarReplacement.map(replaced, replacement);
}		}
▲ Show 20 Lines • Show All 73 Lines • ▼ Show 20 Lines	static ConstantOp vectorizeConstant(ConstantOp constOp,
auto vecAttr = DenseElementsAttr::get(vecTy, constOp.getValue());		auto vecAttr = DenseElementsAttr::get(vecTy, constOp.getValue());
auto newConstOp = state.builder.create<ConstantOp>(constOp.getLoc(), vecAttr);		auto newConstOp = state.builder.create<ConstantOp>(constOp.getLoc(), vecAttr);

// Register vector replacement for future uses in the scope.		// Register vector replacement for future uses in the scope.
state.registerOpVectorReplacement(constOp, newConstOp);		state.registerOpVectorReplacement(constOp, newConstOp);
return newConstOp;		return newConstOp;
}		}

		/// Builds the initial accumulator of a vectorized reduction: a constant
		/// vector in which every element is the neutral (identity) element of
		/// `reductionKind`. The element type is the type of `oldOperand`, and the
		/// constant is created at the location of `oldOperand` using `state.builder`.
		///
		/// Returns a null op if the type of `oldOperand` is not a valid vector
		/// element type.
		static ConstantOp createInitialVector(AtomicRMWKind reductionKind,
		                                      Value oldOperand,
		                                      VectorizationState &state) {
		  Type elementTy = oldOperand.getType();
		  // Bail out on types that cannot be used as vector elements.
		  if (!VectorType::isValidElementType(elementTy))
		    return nullptr;

		  Location loc = oldOperand.getLoc();
		  OpBuilder &builder = state.builder;

		  // Identity value of the reduction for the scalar element type.
		  Attribute identityAttr =
		      getIdentityValueAttr(reductionKind, elementTy, builder, loc);
		  // Vector type dictated by the vectorization strategy, filled with the
		  // identity value.
		  auto vectorTy = getVectorType(elementTy, state.strategy);
		  auto filledAttr = DenseElementsAttr::get(vectorTy, identityAttr);

		  return builder.create<ConstantOp>(loc, filledAttr);
		}

		/// Creates a mask used to filter out garbage elements in the last iteration
		/// of unaligned loops. If a mask is not required then `nullptr` is returned.
		/// The mask will be a vector of booleans representing meaningful vector
		/// elements in the current iteration. It is filled with ones for each iteration
		/// except for the last one, where it has the form `11...100...0` with the
		/// number of ones equal to the number of meaningful elements (i.e. the number
		/// of iterations that would be left in the original loop).
		///
		/// `vecForOp` is the already vectorized loop; its step must be equal to the
		/// (single) vector size of the strategy in `state`. A created mask is stored
		/// in `state.vecLoopToMask` and returned again by later calls for the same
		/// loop. The "no mask required" answer is not stored, so it is recomputed on
		/// each call.
		static Value createMask(AffineForOp vecForOp, VectorizationState &state) {
		// Only 1-D vectorization is handled here.
		assert(state.strategy->vectorSizes.size() == 1 &&
		"Creating a mask non-1-D vectors is not supported.");
		// The step of the vector loop equals the vector size only if the step of the
		// original scalar loop was 1.
		assert(vecForOp.getStep() == state.strategy->vectorSizes[0] &&
		"Creating a mask for loops with non-unit original step size is not "
		"supported.");

		// Check if we have already created the mask.
		if (Value mask = state.vecLoopToMask.lookup(vecForOp))
		return mask;

		// If the loop has constant bounds and the original number of iterations is
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions Note: there is also the case where all the `vector.transfer_read` in the backward slice of the reduction (intersected with the filter "nested under loop of interest") have `inBounds == true`. This is the way we can inject static information into dynamic memrefs atm. nicolasvasilache: Note: there is also the case where all the `vector.transfer_read` in the backward slice of the…
		sgrechanikAuthorUnsubmitted Done Reply Inline Actions Not sure I understand your comment correctly, but there may be cases when the read is within bounds, but we still need to filter out some elements for the reduction (e.g. when the dimension size of a memref is larger than the upper bound of the corresponding loop). sgrechanik: Not sure I understand your comment correctly, but there may be cases when the read is within…
		// divisible by the vector size then we don't need a mask.
		if (vecForOp.hasConstantBounds()) {
		// With a unit original step the trip count of the original loop is simply
		// the distance between the constant bounds.
		int64_t originalTripCount =
		vecForOp.getConstantUpperBound() - vecForOp.getConstantLowerBound();
		if (originalTripCount % vecForOp.getStep() == 0)
		return nullptr;
		}

		// The mask computation is emitted at the start of the vector loop body. The
		// guard is expected to restore the builder's previous insertion point when
		// this function returns.
		OpBuilder::InsertionGuard guard(state.builder);
		state.builder.setInsertionPointToStart(vecForOp.getBody());

		// We generate the mask using the `vector.create_mask` operation which accepts
		// the number of meaningful elements (i.e. the length of the prefix of 1s).
		// To compute the number of meaningful elements we subtract the current value
		// of the iteration variable from the upper bound of the loop. Example:
		//
		// // 500 is the upper bound of the loop
		// #map = affine_map<(d0) -> (500 - d0)>
		// %elems_left = affine.apply #map(%iv)
		// %mask = vector.create_mask %elems_left : vector<128xi1>

		Location loc = vecForOp.getLoc();

		// First we get the upper bound of the loop using `affine.apply` or
		// `affine.min`.
		AffineMap ubMap = vecForOp.getUpperBoundMap();
		Value ub;
		// A single-result bound map is applied directly; a multi-result map is
		// materialized with `affine.min`.
		if (ubMap.getNumResults() == 1)
		ub = state.builder.create<AffineApplyOp>(loc, vecForOp.getUpperBoundMap(),
		vecForOp.getUpperBoundOperands());
		else
		ub = state.builder.create<AffineMinOp>(loc, vecForOp.getUpperBoundMap(),
		vecForOp.getUpperBoundOperands());
		// Then we compute the number of (original) iterations left in the loop.
		// The expression is `d0 - d1` with d0 bound to `ub` and d1 bound to the
		// induction variable of the vector loop (see the operand list below).
		AffineExpr subExpr =
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions for composability you prob. want this to be an AffineApplyOp itself. nicolasvasilache: for composability you prob. want this to be an AffineApplyOp itself.
		sgrechanikAuthorUnsubmitted Done Reply Inline Actions Yes, makes sense. sgrechanik: Yes, makes sense.
		state.builder.getAffineDimExpr(0) - state.builder.getAffineDimExpr(1);
		Value itersLeft =
		makeComposedAffineApply(state.builder, loc, AffineMap::get(2, 0, subExpr),
		{ub, vecForOp.getInductionVar()});
		// If the affine maps were successfully composed then `ub` is unneeded.
		if (ub.use_empty())
		ub.getDefiningOp()->erase();
		// Finally we create the mask.
		// The mask type is a vector of i1 with the shape given by the strategy's
		// vector sizes.
		Type maskTy = VectorType::get(state.strategy->vectorSizes,
		state.builder.getIntegerType(1));
		Value mask =
		state.builder.create<vector::CreateMaskOp>(loc, maskTy, itersLeft);

		LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a mask:\n"
		<< itersLeft << "\n"
		<< mask << "\n");

		// Remember the mask so that later requests for this loop reuse it.
		state.vecLoopToMask[vecForOp] = mask;
		return mask;
		}

/// Returns true if the provided value is vector uniform given the vectorization		/// Returns true if the provided value is vector uniform given the vectorization
/// strategy.		/// strategy.
// TODO: For now, only values that are invariants to all the loops in the		// TODO: For now, only values that are invariants to all the loops in the
// vectorization strategy are considered vector uniforms.		// vectorization strategy are considered vector uniforms.
static bool isUniformDefinition(Value value,		static bool isUniformDefinition(Value value,
const VectorizationStrategy *strategy) {		const VectorizationStrategy *strategy) {
for (auto loopToDim : strategy->loopToVectorDim) {		for (auto loopToDim : strategy->loopToVectorDim) {
auto loop = cast<AffineForOp>(loopToDim.first);		auto loop = cast<AffineForOp>(loopToDim.first);
▲ Show 20 Lines • Show All 157 Lines • ▼ Show 20 Lines	auto transfer = state.builder.create<vector::TransferWriteOp>(
permutationMap);		permutationMap);
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << transfer);		LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << transfer);

// Register replacement for future uses in the scope.		// Register replacement for future uses in the scope.
state.registerOpVectorReplacement(storeOp, transfer);		state.registerOpVectorReplacement(storeOp, transfer);
return transfer;		return transfer;
}		}

		/// Tells whether `value` is defined by a constant op whose attribute is equal
		/// to the neutral (identity) element of the reduction `reductionKind` for the
		/// type of `value`. Returns false if the type of `value` is not a valid vector
		/// element type or if `value` is not defined by a constant op.
		static bool isNeutralElementConst(AtomicRMWKind reductionKind, Value value,
		                                  VectorizationState &state) {
		  Type elementTy = value.getType();
		  if (!VectorType::isValidElementType(elementTy))
		    return false;

		  // The identity attribute is computed before inspecting the definition of
		  // `value`, exactly as the comparison below expects.
		  Attribute identityAttr = getIdentityValueAttr(
		      reductionKind, elementTy, state.builder, value.getLoc());

		  // Null when `value` has no defining op or is defined by another kind of op.
		  auto definingConst = value.getDefiningOp<ConstantOp>();
		  return definingConst && definingConst.value() == identityAttr;
		}

/// Vectorizes a loop with the vectorization strategy in 'state'. A new loop is		/// Vectorizes a loop with the vectorization strategy in 'state'. A new loop is
/// created and registered as replacement for the scalar loop. The builder's		/// created and registered as replacement for the scalar loop. The builder's
/// insertion point is set to the new loop's body so that subsequent vectorized		/// insertion point is set to the new loop's body so that subsequent vectorized
/// operations are inserted into the new loop. If the loop is a vector		/// operations are inserted into the new loop. If the loop is a vector
/// dimension, the step of the newly created loop will reflect the vectorization		/// dimension, the step of the newly created loop will reflect the vectorization
/// factor used to vectorize that dimension.		/// factor used to vectorize that dimension.
// TODO: Add support for 'iter_args'. Related operands and results will be
// vectorized at this point.
static Operation *vectorizeAffineForOp(AffineForOp forOp,		static Operation *vectorizeAffineForOp(AffineForOp forOp,
VectorizationState &state) {		VectorizationState &state) {
const VectorizationStrategy &strategy = *state.strategy;		const VectorizationStrategy &strategy = *state.strategy;
auto loopToVecDimIt = strategy.loopToVectorDim.find(forOp);		auto loopToVecDimIt = strategy.loopToVectorDim.find(forOp);
bool isLoopVecDim = loopToVecDimIt != strategy.loopToVectorDim.end();		bool isLoopVecDim = loopToVecDimIt != strategy.loopToVectorDim.end();

// We only support 'iter_args' when the loop is not one of the vector		// TODO: Vectorization of reduction loops is not supported for non-unit steps.
// dimensions.		if (isLoopVecDim && forOp.getNumIterOperands() > 0 && forOp.getStep() != 1) {
// TODO: Support vector dimension loops. They require special handling:		LLVM_DEBUG(
// generate horizontal reduction, last-value extraction, etc.		dbgs()
if (forOp.getNumIterOperands() > 0 && isLoopVecDim)		<< "\n[early-vect]+++++ unsupported step size for reduction loop: "
		<< forOp.getStep() << "\n");
return nullptr;		return nullptr;
		}

// If we are vectorizing a vector dimension, compute a new step for the new		// If we are vectorizing a vector dimension, compute a new step for the new
// vectorized loop using the vectorization factor for the vector dimension.		// vectorized loop using the vectorization factor for the vector dimension.
// Otherwise, propagate the step of the scalar loop.		// Otherwise, propagate the step of the scalar loop.
unsigned newStep;		unsigned newStep;
if (isLoopVecDim) {		if (isLoopVecDim) {
unsigned vectorDim = loopToVecDimIt->second;		unsigned vectorDim = loopToVecDimIt->second;
assert(vectorDim < strategy.vectorSizes.size() && "vector dim overflow");		assert(vectorDim < strategy.vectorSizes.size() && "vector dim overflow");
int64_t forOpVecFactor = strategy.vectorSizes[vectorDim];		int64_t forOpVecFactor = strategy.vectorSizes[vectorDim];
newStep = forOp.getStep() * forOpVecFactor;		newStep = forOp.getStep() * forOpVecFactor;
} else {		} else {
newStep = forOp.getStep();		newStep = forOp.getStep();
}		}

		// Get information about reduction kinds.
		ArrayRef<LoopReduction> reductions;
		if (isLoopVecDim && forOp.getNumIterOperands() > 0) {
		auto it = strategy.reductionLoops.find(forOp);
		assert(it != strategy.reductionLoops.end() &&
		"Reduction descriptors not found when vectorizing a reduction loop");
		reductions = it->second;
		assert(reductions.size() == forOp.getNumIterOperands() &&
		"The size of reductions array must match the number of iter_args");
		}

// Vectorize 'iter_args'.		// Vectorize 'iter_args'.
SmallVector<Value, 8> vecIterOperands;		SmallVector<Value, 8> vecIterOperands;
		if (!isLoopVecDim) {
for (auto operand : forOp.getIterOperands())		for (auto operand : forOp.getIterOperands())
vecIterOperands.push_back(vectorizeOperand(operand, state));		vecIterOperands.push_back(vectorizeOperand(operand, state));
		} else {
		// For reduction loops we need to pass a vector of neutral elements as an
		// initial value of the accumulator. We will add the original initial value
		// later.
		for (auto redAndOperand : llvm::zip(reductions, forOp.getIterOperands())) {
		vecIterOperands.push_back(createInitialVector(
		std::get<0>(redAndOperand).kind, std::get<1>(redAndOperand), state));
		}
		}

auto vecForOp = state.builder.create<AffineForOp>(		auto vecForOp = state.builder.create<AffineForOp>(
forOp.getLoc(), forOp.getLowerBoundOperands(), forOp.getLowerBoundMap(),		forOp.getLoc(), forOp.getLowerBoundOperands(), forOp.getLowerBoundMap(),
forOp.getUpperBoundOperands(), forOp.getUpperBoundMap(), newStep,		forOp.getUpperBoundOperands(), forOp.getUpperBoundMap(), newStep,
vecIterOperands,		vecIterOperands,
/bodyBuilder=/[](OpBuilder &, Location, Value, ValueRange) {		/bodyBuilder=/[](OpBuilder &, Location, Value, ValueRange) {
// Make sure we don't create a default terminator in the loop body as		// Make sure we don't create a default terminator in the loop body as
// the proper terminator will be added during vectorization.		// the proper terminator will be added during vectorization.
return;		return;
});		});

// Register loop-related replacements:		// Register loop-related replacements:
// 1) The new vectorized loop is registered as vector replacement of the		// 1) The new vectorized loop is registered as vector replacement of the
// scalar loop.		// scalar loop.
// TODO: Support reductions along the vector dimension.
// 2) The new iv of the vectorized loop is registered as scalar replacement		// 2) The new iv of the vectorized loop is registered as scalar replacement
// since a scalar copy of the iv will prevail in the vectorized loop.		// since a scalar copy of the iv will prevail in the vectorized loop.
// TODO: A vector replacement will also be added in the future when		// TODO: A vector replacement will also be added in the future when
// vectorization of linear ops is supported.		// vectorization of linear ops is supported.
// 3) The new 'iter_args' region arguments are registered as vector		// 3) The new 'iter_args' region arguments are registered as vector
// replacements since they have been vectorized.		// replacements since they have been vectorized.
		// 4) If the loop performs a reduction along the vector dimension, a
		// `vector.reduction` or similar op is inserted for each resulting value
		// of the loop and its scalar value replaces the corresponding scalar
		// result of the loop.
state.registerOpVectorReplacement(forOp, vecForOp);		state.registerOpVectorReplacement(forOp, vecForOp);
state.registerValueScalarReplacement(forOp.getInductionVar(),		state.registerValueScalarReplacement(forOp.getInductionVar(),
vecForOp.getInductionVar());		vecForOp.getInductionVar());
for (auto iterTuple :		for (auto iterTuple :
llvm ::zip(forOp.getRegionIterArgs(), vecForOp.getRegionIterArgs()))		llvm ::zip(forOp.getRegionIterArgs(), vecForOp.getRegionIterArgs()))
state.registerBlockArgVectorReplacement(std::get<0>(iterTuple),		state.registerBlockArgVectorReplacement(std::get<0>(iterTuple),
std::get<1>(iterTuple));		std::get<1>(iterTuple));

		if (isLoopVecDim) {
		for (unsigned i = 0; i < vecForOp.getNumIterOperands(); ++i) {
		// First, we reduce the vector returned from the loop into a scalar.
		Value reducedRes =
		getVectorReductionOp(reductions[i].kind, state.builder,
		vecForOp.getLoc(), vecForOp.getResult(i));
		LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a vector reduction: "
		<< reducedRes);
		// Then we combine it with the original (scalar) initial value unless it
		// is equal to the neutral element of the reduction.
		Value origInit = forOp.getOperand(forOp.getNumControlOperands() + i);
		Value finalRes = reducedRes;
		if (!isNeutralElementConst(reductions[i].kind, origInit, state))
		finalRes = getReductionOp(reductions[i].kind, state.builder,
		reducedRes.getLoc(), reducedRes, origInit);
		state.registerLoopResultScalarReplacement(forOp.getResult(i), finalRes);
		}
		}

if (isLoopVecDim)		if (isLoopVecDim)
state.vecLoopToVecDim[vecForOp] = loopToVecDimIt->second;		state.vecLoopToVecDim[vecForOp] = loopToVecDimIt->second;

// Change insertion point so that upcoming vectorized instructions are		// Change insertion point so that upcoming vectorized instructions are
// inserted into the vectorized loop's body.		// inserted into the vectorized loop's body.
state.builder.setInsertionPointToStart(vecForOp.getBody());		state.builder.setInsertionPointToStart(vecForOp.getBody());

		// If this is a reduction loop then we may need to create a mask to filter out
		// garbage in the last iteration.
		if (isLoopVecDim && forOp.getNumIterOperands() > 0)
		createMask(vecForOp, state);

return vecForOp;		return vecForOp;
}		}

/// Vectorizes arbitrary operation by plain widening. We apply generic type		/// Vectorizes arbitrary operation by plain widening. We apply generic type
/// widening of all its results and retrieve the vector counterparts for all its		/// widening of all its results and retrieve the vector counterparts for all its
/// operands.		/// operands.
static Operation widenOp(Operation op, VectorizationState &state) {		static Operation widenOp(Operation op, VectorizationState &state) {
SmallVector<Type, 8> vectorTypes;		SmallVector<Type, 8> vectorTypes;
Show All 21 Lines	OperationState vecOpState(op->getLoc(), op->getName().getStringRef(),
/successors=/{}, /regions=/{});		/successors=/{}, /regions=/{});
Operation *vecOp = state.builder.createOperation(vecOpState);		Operation *vecOp = state.builder.createOperation(vecOpState);
state.registerOpVectorReplacement(op, vecOp);		state.registerOpVectorReplacement(op, vecOp);
return vecOp;		return vecOp;
}		}

/// Vectorizes a yield operation by widening its types. The builder's insertion		/// Vectorizes a yield operation by widening its types. The builder's insertion
/// point is set after the vectorized parent op to continue vectorizing the		/// point is set after the vectorized parent op to continue vectorizing the
/// operations after the parent op.		/// operations after the parent op. When vectorizing a reduction loop a mask may
		/// be used to prevent adding garbage values to the accumulator.
static Operation *vectorizeAffineYieldOp(AffineYieldOp yieldOp,		static Operation *vectorizeAffineYieldOp(AffineYieldOp yieldOp,
VectorizationState &state) {		VectorizationState &state) {
Operation *newYieldOp = widenOp(yieldOp, state);		Operation *newYieldOp = widenOp(yieldOp, state);
Operation *newParentOp = state.builder.getInsertionBlock()->getParentOp();		Operation *newParentOp = state.builder.getInsertionBlock()->getParentOp();

		// If there is a mask for this loop then we must prevent garbage values from
		// being added to the accumulator by inserting `select` operations, for
		// example:
		//
		// %res = addf %acc, %val : vector<128xf32>
		// %res_masked = select %mask, %res, %acc : vector<128xi1>, vector<128xf32>
		// affine.yield %res_masked : vector<128xf32>
		//
		if (Value mask = state.vecLoopToMask.lookup(newParentOp)) {
		state.builder.setInsertionPoint(newYieldOp);
		for (unsigned i = 0; i < newYieldOp->getNumOperands(); ++i) {
		Value result = newYieldOp->getOperand(i);
		Value iterArg = cast<AffineForOp>(newParentOp).getRegionIterArgs()[i];
		Value maskedResult = state.builder.create<SelectOp>(result.getLoc(), mask,
		result, iterArg);
		LLVM_DEBUG(
		dbgs() << "\n[early-vect]+++++ masking a yielded vector value: "
		<< maskedResult);
		newYieldOp->setOperand(i, maskedResult);
		}
		}

state.builder.setInsertionPointAfter(newParentOp);		state.builder.setInsertionPointAfter(newParentOp);
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions Note: some of this is quite ancient and predates `OpBuilder::InsertionGuard`. We should cleanup the load-bearing "insertion point"-passing across function boundaries via `state.builder` at some point. nicolasvasilache: Note: some of this is quite ancient and predates `OpBuilder::InsertionGuard`. We should cleanup…
return newYieldOp;		return newYieldOp;
}		}

/// Encodes Operation-specific behavior for vectorization. In general we		/// Encodes Operation-specific behavior for vectorization. In general we
/// assume that all operands of an op must be vectorized but this is not		/// assume that all operands of an op must be vectorized but this is not
/// always true. In the future, it would be nice to have a trait that		/// always true. In the future, it would be nice to have a trait that
/// describes how a particular operation vectorizes. For now we implement the		/// describes how a particular operation vectorizes. For now we implement the
/// case distinction here. Returns a vectorized form of an operation or		/// case distinction here. Returns a vectorized form of an operation or
▲ Show 20 Lines • Show All 86 Lines • ▼ Show 20 Lines	vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
// vectorization succeeds, the scalar loop nest is erased. If vectorization		// vectorization succeeds, the scalar loop nest is erased. If vectorization
// fails, the vector loop nest is erased and the scalar loop nest is not		// fails, the vector loop nest is erased and the scalar loop nest is not
// modified.		// modified.
//////////////////////////////////////////////////////////////////////////////		//////////////////////////////////////////////////////////////////////////////

auto opVecResult = rootLoop.walk<WalkOrder::PreOrder>([&](Operation *op) {		auto opVecResult = rootLoop.walk<WalkOrder::PreOrder>([&](Operation *op) {
LLVM_DEBUG(dbgs() << "[early-vect]+++++ Vectorizing: " << *op);		LLVM_DEBUG(dbgs() << "[early-vect]+++++ Vectorizing: " << *op);
Operation *vectorOp = vectorizeOneOperation(op, state);		Operation *vectorOp = vectorizeOneOperation(op, state);
if (!vectorOp)		if (!vectorOp) {
		LLVM_DEBUG(
		dbgs() << "[early-vect]+++++ failed vectorizing the operation: "
		<< *op << "\n");
return WalkResult::interrupt();		return WalkResult::interrupt();
		}

return WalkResult::advance();		return WalkResult::advance();
});		});

if (opVecResult.wasInterrupted()) {		if (opVecResult.wasInterrupted()) {
LLVM_DEBUG(dbgs() << "[early-vect]+++++ failed vectorization for: "		LLVM_DEBUG(dbgs() << "[early-vect]+++++ failed vectorization for: "
<< rootLoop << "\n");		<< rootLoop << "\n");
// Erase vector loop nest if it was created.		// Erase vector loop nest if it was created.
auto vecRootLoopIt = state.opVectorReplacement.find(rootLoop);		auto vecRootLoopIt = state.opVectorReplacement.find(rootLoop);
if (vecRootLoopIt != state.opVectorReplacement.end())		if (vecRootLoopIt != state.opVectorReplacement.end())
eraseLoopNest(cast<AffineForOp>(vecRootLoopIt->second));		eraseLoopNest(cast<AffineForOp>(vecRootLoopIt->second));

return failure();		return failure();
}		}

		// Replace results of reduction loops with the scalar values computed using
		// `vector.reduction` or similar ops.
		for (auto resPair : state.loopResultScalarReplacement)
		resPair.first.replaceAllUsesWith(resPair.second);

assert(state.opVectorReplacement.count(rootLoop) == 1 &&		assert(state.opVectorReplacement.count(rootLoop) == 1 &&
"Expected vector replacement for loop nest");		"Expected vector replacement for loop nest");
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");		LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorization result:\n"		LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorization result:\n"
<< *state.opVectorReplacement[rootLoop]);		<< *state.opVectorReplacement[rootLoop]);

// Finish this vectorization pattern.		// Finish this vectorization pattern.
state.finishVectorizationPattern(rootLoop);		state.finishVectorizationPattern(rootLoop);
▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines	for (const NestedMatch &match : matches) {
}		}
}		}
}		}

/// Internal implementation to vectorize affine loops in 'loops' using the n-D		/// Internal implementation to vectorize affine loops in 'loops' using the n-D
/// vectorization factors in 'vectorSizes'. By default, each vectorization		/// vectorization factors in 'vectorSizes'. By default, each vectorization
/// factor is applied inner-to-outer to the loops of each loop nest.		/// factor is applied inner-to-outer to the loops of each loop nest.
/// 'fastestVaryingPattern' can be optionally used to provide a different loop		/// 'fastestVaryingPattern' can be optionally used to provide a different loop
/// vectorization order.		/// vectorization order. `reductionLoops` can be provided to specify loops which
		/// can be vectorized along the reduction dimension.
static void vectorizeLoops(Operation parentOp, DenseSet<Operation > &loops,		static void vectorizeLoops(Operation parentOp, DenseSet<Operation > &loops,
ArrayRef<int64_t> vectorSizes,		ArrayRef<int64_t> vectorSizes,
ArrayRef<int64_t> fastestVaryingPattern) {		ArrayRef<int64_t> fastestVaryingPattern,
		const ReductionLoopMap &reductionLoops) {
		assert((reductionLoops.empty() \|\| vectorSizes.size() == 1) &&
		"Vectorizing reductions is supported only for 1-D vectors");

// Compute 1-D, 2-D or 3-D loop pattern to be matched on the target loops.		// Compute 1-D, 2-D or 3-D loop pattern to be matched on the target loops.
Optional<NestedPattern> pattern =		Optional<NestedPattern> pattern =
makePattern(loops, vectorSizes.size(), fastestVaryingPattern);		makePattern(loops, vectorSizes.size(), fastestVaryingPattern);
if (!pattern.hasValue()) {		if (!pattern.hasValue()) {
LLVM_DEBUG(dbgs() << "\n[early-vect] pattern couldn't be computed\n");		LLVM_DEBUG(dbgs() << "\n[early-vect] pattern couldn't be computed\n");
return;		return;
}		}

Show All 14 Lines	static void vectorizeLoops(Operation parentOp, DenseSet<Operation > &loops,
// Iterate over all buckets and vectorize the matches eagerly. We can only		// Iterate over all buckets and vectorize the matches eagerly. We can only
// vectorize one match from each bucket since all the matches within a bucket		// vectorize one match from each bucket since all the matches within a bucket
// intersect.		// intersect.
for (auto &intersectingMatches : intersectionBuckets) {		for (auto &intersectingMatches : intersectionBuckets) {
for (NestedMatch &match : intersectingMatches) {		for (NestedMatch &match : intersectingMatches) {
VectorizationStrategy strategy;		VectorizationStrategy strategy;
// TODO: depending on profitability, elect to reduce the vector size.		// TODO: depending on profitability, elect to reduce the vector size.
strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());		strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());
		strategy.reductionLoops = reductionLoops;
if (failed(analyzeProfitability(match.getMatchedChildren(), 1,		if (failed(analyzeProfitability(match.getMatchedChildren(), 1,
patternDepth, &strategy))) {		patternDepth, &strategy))) {
continue;		continue;
}		}
vectorizeLoopIfProfitable(match.getMatchedOperation(), 0, patternDepth,		vectorizeLoopIfProfitable(match.getMatchedOperation(), 0, patternDepth,
&strategy);		&strategy);
// Vectorize match. Skip the rest of intersecting matches in the bucket if		// Vectorize match. Skip the rest of intersecting matches in the bucket if
// vectorization succeeded.		// vectorization succeeded.
Show All 21 Lines	void Vectorize::runOnFunction() {
FuncOp f = getFunction();		FuncOp f = getFunction();
if (!fastestVaryingPattern.empty() &&		if (!fastestVaryingPattern.empty() &&
fastestVaryingPattern.size() != vectorSizes.size()) {		fastestVaryingPattern.size() != vectorSizes.size()) {
f.emitRemark("Fastest varying pattern specified with different size than "		f.emitRemark("Fastest varying pattern specified with different size than "
"the vector size.");		"the vector size.");
return signalPassFailure();		return signalPassFailure();
}		}

		if (vectorizeReductions && vectorSizes.size() != 1) {
		f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
		return signalPassFailure();
		}

DenseSet<Operation *> parallelLoops;		DenseSet<Operation *> parallelLoops;
		ReductionLoopMap reductionLoops;

		// If 'vectorize-reduction=true' is provided, we also populate the
		// `reductionLoops` map.
		if (vectorizeReductions) {
		f.walk([&parallelLoops, &reductionLoops](AffineForOp loop) {
		SmallVector<LoopReduction, 2> reductions;
		if (isLoopParallel(loop, &reductions)) {
		parallelLoops.insert(loop);
		// If it's not a reduction loop, adding it to the map is not necessary.
		if (!reductions.empty())
		reductionLoops[loop] = reductions;
		}
		});
		} else {
f.walk([&parallelLoops](AffineForOp loop) {		f.walk([&parallelLoops](AffineForOp loop) {
if (isLoopParallel(loop))		if (isLoopParallel(loop))
parallelLoops.insert(loop);		parallelLoops.insert(loop);
});		});
		}

// Thread-safe RAII local context, BumpPtrAllocator freed on exit.		// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;		NestedPatternContext mlContext;
vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);		vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern,
		reductionLoops);
}		}

/// Verify that affine loops in 'loops' meet the nesting criteria expected by		/// Verify that affine loops in 'loops' meet the nesting criteria expected by
/// SuperVectorizer:		/// SuperVectorizer:
/// * There must be at least one loop.		/// * There must be at least one loop.
/// * There must be a single root loop (nesting level 0).		/// * There must be a single root loop (nesting level 0).
/// * Each loop at a given nesting level must be nested in a loop from a		/// * Each loop at a given nesting level must be nested in a loop from a
/// previous nesting level.		/// previous nesting level.
Show All 27 Lines	verifyLoopNesting(const std::vector<SmallVector<AffineForOp, 2>> &loops) {
}		}

return success();		return success();
}		}

namespace mlir {		namespace mlir {

/// External utility to vectorize affine loops in 'loops' using the n-D		/// External utility to vectorize affine loops in 'loops' using the n-D
/// vectorization factors in 'vectorSizes'. By default, each vectorization		/// vectorization factors in 'vectorSizes'. By default, each vectorization
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions Nice! This is completely unrelated to affine though and could help other places (e.g. the Linalg vectorizer). Can you please move this to a dialect-independent/std-dialect utils ? nicolasvasilache: Nice! This is completely unrelated to affine though and could help other places (e.g. the…
		sgrechanikAuthorUnsubmitted Done Reply Inline Actions Yeah, I'll try to move it into some more publicly accessible place. sgrechanik: Yeah, I'll try to move it into some more publicly accessible place.
/// factor is applied inner-to-outer to the loops of each loop nest.		/// factor is applied inner-to-outer to the loops of each loop nest.
/// 'fastestVaryingPattern' can be optionally used to provide a different loop		/// 'fastestVaryingPattern' can be optionally used to provide a different loop
/// vectorization order.		/// vectorization order.
		/// If `reductionLoops` is not empty, the given reduction loops may be
		/// vectorized along the reduction dimension.
		/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,		void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
ArrayRef<int64_t> vectorSizes,		ArrayRef<int64_t> vectorSizes,
ArrayRef<int64_t> fastestVaryingPattern) {		ArrayRef<int64_t> fastestVaryingPattern,
		const ReductionLoopMap &reductionLoops) {
// Thread-safe RAII local context, BumpPtrAllocator freed on exit.		// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;		NestedPatternContext mlContext;
vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern);		vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern,
		reductionLoops);
}		}

/// External utility to vectorize affine loops from a single loop nest using an		/// External utility to vectorize affine loops from a single loop nest using an
/// n-D vectorization strategy (see doc in VectorizationStrategy definition).		/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
/// Loops are provided in a 2D vector container. The first dimension represents		/// Loops are provided in a 2D vector container. The first dimension represents
/// the nesting level relative to the loops to be vectorized. The second		/// the nesting level relative to the loops to be vectorized. The second
/// dimension contains the loops. This means that:		/// dimension contains the loops. This means that:
/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',		/// a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
▲ Show 20 Lines • Show All 48 Lines • Show Last 20 Lines

mlir/lib/Dialect/StandardOps/IR/Ops.cpp

Show First 20 Lines • Show All 402 Lines • ▼ Show 20 Lines	if (!op.value().getType().isa<IntegerType>())
<< "' expects an integer type";		<< "' expects an integer type";
break;		break;
default:		default:
break;		break;
}		}
return success();		return success();
}		}

		/// Produces the attribute holding the identity (neutral) element of the
		/// reduction denoted by `kind`, typed as `resultType`: the zero attribute for
		/// addf/addi and the constant one for muli/mulf. For any other kind the
		/// problem is reported through `emitOptionalError` on `loc` and a null
		/// attribute is returned.
		Attribute mlir::getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
		                                     OpBuilder &builder, Location loc) {
		  const bool isAddition =
		      kind == AtomicRMWKind::addf || kind == AtomicRMWKind::addi;
		  if (isAddition)
		    return builder.getZeroAttr(resultType);
		  if (kind == AtomicRMWKind::muli)
		    return builder.getIntegerAttr(resultType, 1);
		  if (kind == AtomicRMWKind::mulf)
		    return builder.getFloatAttr(resultType, 1);
		  // TODO: Add remaining reduction operations.
		  (void)emitOptionalError(loc, "Reduction operation type not supported");
		  return nullptr;
		}

		/// Returns the identity value associated with an AtomicRMWKind op,
		/// materialized as a ConstantOp created at `loc` with type `resultType`.
		/// Returns a null Value when `getIdentityValueAttr` produces no attribute,
		/// i.e. when the reduction kind is not supported (that helper reports the
		/// error itself).
		Value mlir::getIdentityValue(AtomicRMWKind op, Type resultType,
		                             OpBuilder &builder, Location loc) {
		  Attribute attr = getIdentityValueAttr(op, resultType, builder, loc);
		  // A null attribute must not reach ConstantOp's builder; propagate the
		  // failure to the caller as a null Value instead.
		  if (!attr)
		    return nullptr;
		  return builder.create<ConstantOp>(loc, attr);
		}

		/// Builds, at `loc`, the binary operation that corresponds to the reduction
		/// kind `op` (AddFOp, AddIOp, MulFOp or MulIOp) applied to `lhs` and `rhs`,
		/// and returns its result. For any other kind the problem is reported through
		/// `emitOptionalError` on `loc` and a null Value is returned.
		Value mlir::getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
		                           Value lhs, Value rhs) {
		  // Stays null unless one of the supported kinds below is selected.
		  Value combined;
		  switch (op) {
		  case AtomicRMWKind::addf:
		    combined = builder.create<AddFOp>(loc, lhs, rhs);
		    break;
		  case AtomicRMWKind::addi:
		    combined = builder.create<AddIOp>(loc, lhs, rhs);
		    break;
		  case AtomicRMWKind::mulf:
		    combined = builder.create<MulFOp>(loc, lhs, rhs);
		    break;
		  case AtomicRMWKind::muli:
		    combined = builder.create<MulIOp>(loc, lhs, rhs);
		    break;
		  // TODO: Add remaining reduction operations.
		  default:
		    (void)emitOptionalError(loc, "Reduction operation type not supported");
		    break;
		  }
		  return combined;
		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// GenericAtomicRMWOp		// GenericAtomicRMWOp
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

void GenericAtomicRMWOp::build(OpBuilder &builder, OperationState &result,		void GenericAtomicRMWOp::build(OpBuilder &builder, OperationState &result,
Value memref, ValueRange ivs) {		Value memref, ValueRange ivs) {
result.addOperands(memref);		result.addOperands(memref);
result.addOperands(ivs);		result.addOperands(ivs);
▲ Show 20 Lines • Show All 2,698 Lines • Show Last 20 Lines

mlir/lib/Dialect/Vector/VectorOps.cpp

	Show First 20 Lines • Show All 327 Lines • ▼ Show 20 Lines

	static void print(OpAsmPrinter &p, ReductionOp op) {			static void print(OpAsmPrinter &p, ReductionOp op) {
	p << op.getOperationName() << " \"" << op.kind() << "\", " << op.vector();			p << op.getOperationName() << " \"" << op.kind() << "\", " << op.vector();
	if (!op.acc().empty())			if (!op.acc().empty())
	p << ", " << op.acc();			p << ", " << op.acc();
	p << " : " << op.vector().getType() << " into " << op.dest().getType();			p << " : " << op.vector().getType() << " into " << op.dest().getType();
	}			}

				/// Creates a vector::ReductionOp that reduces `vector` to a scalar of its
				/// element type, using the "add" kind for addf/addi and the "mul" kind for
				/// mulf/muli. The op is created at the location of `vector`; `loc` is used
				/// only to report an unsupported reduction kind, in which case a null Value
				/// is returned.
				Value mlir::vector::getVectorReductionOp(AtomicRMWKind op, OpBuilder &builder,
				                                         Location loc, Value vector) {
				  Type scalarType = vector.getType().cast<ShapedType>().getElementType();

				  // Select the textual reduction kind first; bail out on unsupported kinds.
				  const char *kindName = nullptr;
				  switch (op) {
				  case AtomicRMWKind::addf:
				  case AtomicRMWKind::addi:
				    kindName = "add";
				    break;
				  case AtomicRMWKind::mulf:
				  case AtomicRMWKind::muli:
				    kindName = "mul";
				    break;
				  // TODO: Add remaining reduction operations.
				  default:
				    (void)emitOptionalError(loc, "Reduction operation type not supported");
				    return nullptr;
				  }

				  return builder.create<vector::ReductionOp>(vector.getLoc(), scalarType,
				                                             builder.getStringAttr(kindName),
				                                             vector, ValueRange{});
				}

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// ContractionOp			// ContractionOp
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	void vector::ContractionOp::build(OpBuilder &builder, OperationState &result,			void vector::ContractionOp::build(OpBuilder &builder, OperationState &result,
	Value lhs, Value rhs, Value acc,			Value lhs, Value rhs, Value acc,
	ArrayRef<ArrayRef<AffineExpr>> indexingExprs,			ArrayRef<ArrayRef<AffineExpr>> indexingExprs,
	ArrayRef<StringRef> iteratorTypes) {			ArrayRef<StringRef> iteratorTypes) {
	▲ Show 20 Lines • Show All 3,499 Lines • Show Last 20 Lines

mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir

	Show First 20 Lines • Show All 584 Lines • ▼ Show 20 Lines
	// CHECK: affine.for %{{.*}} = 0 to 256 step 128 {			// CHECK: affine.for %{{.*}} = 0 to 256 step 128 {
	// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>			// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
	// CHECK: %[[last_val:.*]] = affine.for %{{.*}} = 0 to 128 iter_args(%[[last_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {			// CHECK: %[[last_val:.*]] = affine.for %{{.*}} = 0 to 128 iter_args(%[[last_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
	// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<128x256xf32>, vector<128xf32>			// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<128x256xf32>, vector<128xf32>
	// CHECK: affine.yield %[[ld]] : vector<128xf32>			// CHECK: affine.yield %[[ld]] : vector<128xf32>
	// CHECK: }			// CHECK: }
	// CHECK: vector.transfer_write %[[last_val]], %{{.*}} : vector<128xf32>, memref<256xf32>			// CHECK: vector.transfer_write %[[last_val]], %{{.*}} : vector<128xf32>, memref<256xf32>
	// CHECK: }			// CHECK: }

				// -----

				// The inner reduction loop '%j' is not vectorized if we do not request
				// reduction vectorization.

				func @vec_vecdim_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vec_vecdim_reduction_rejected
				// CHECK-NOT: vector

mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir

This file was added.

				// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0 vectorize-reductions=true" -split-input-file | FileCheck %s

				// The inner reduction loop '%j' is vectorized.

				func @vecdim_reduction(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vecdim_reduction
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
				// CHECK: affine.yield %[[add]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
				// CHECK: }

				// -----

				// The inner reduction loop '%j' is vectorized. (The order of addf's operands is
				// different than in the previous test case).

				func @vecdim_reduction_comm(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %ld, %red_iter : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vecdim_reduction_comm
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[ld]], %[[red_iter]] : vector<128xf32>
				// CHECK: affine.yield %[[add]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
				// CHECK: }

				// -----

				// The inner reduction loop '%j' is vectorized. Transforming the input before
				// performing the accumulation doesn't cause any problem.

				func @vecdim_reduction_expsin(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%sin = math.sin %ld : f32
				%exp = math.exp %sin : f32
				%add = addf %red_iter, %exp : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vecdim_reduction_expsin
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[sin:.*]] = math.sin %[[ld]]
				// CHECK: %[[exp:.*]] = math.exp %[[sin]]
				// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[exp]] : vector<128xf32>
				// CHECK: affine.yield %[[add]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
				// CHECK: }

				// -----

				// Two reductions at the same time. The inner reduction loop '%j' is vectorized.

				func @two_vecdim_reductions(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>) {
				%cst = constant 1.000000e+00 : f32
				affine.for %i = 0 to 256 {
				// Note that we pass the same constant '1.0' as initial values for both
				// reductions.
				%sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %part_sum, %ld : f32
				%mul = mulf %part_prod, %ld : f32
				affine.yield %add, %mul : f32, f32
				}
				affine.store %sum, %out_sum[%i] : memref<256xf32>
				affine.store %prod, %out_prod[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @two_vecdim_reductions
				// CHECK: %[[cst:.*]] = constant 1.000000e+00 : f32
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[vone:.*]] = constant dense<1.000000e+00> : vector<128xf32>
				// CHECK: %[[vred:.*]]:2 = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[part_sum:.*]] = %[[vzero]], %[[part_prod:.*]] = %[[vone]]) -> (vector<128xf32>, vector<128xf32>) {
				// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[part_sum]], %[[ld]] : vector<128xf32>
				// CHECK: %[[mul:.*]] = mulf %[[part_prod]], %[[ld]] : vector<128xf32>
				// CHECK: affine.yield %[[add]], %[[mul]] : vector<128xf32>, vector<128xf32>
				// CHECK: }
				// CHECK: %[[nonfinal_sum:.*]] = vector.reduction "add", %[[vred:.*]]#0 : vector<128xf32> into f32
				// Note that to compute the final sum we need to add the original initial value
				// (%cst) since it is not zero.
				// CHECK: %[[final_sum:.*]] = addf %[[nonfinal_sum]], %[[cst]] : f32
				// For the final product we don't need to do this additional step because the
				// initial value equals 1 (the neutral element for multiplication).
				// CHECK: %[[final_prod:.*]] = vector.reduction "mul", %[[vred:.*]]#1 : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
				// CHECK: affine.store %[[final_prod]], %{{.*}} : memref<256xf32>
				// CHECK: }

				// -----

				// The integer case.

				func @two_vecdim_reductions_int(%in: memref<256x512xi64>, %out_sum: memref<256xi64>, %out_prod: memref<256xi64>) {
				%cst0 = constant 0 : i64
				%cst1 = constant 1 : i64
				affine.for %i = 0 to 256 {
				%sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst0, %part_prod = %cst1) -> (i64, i64) {
				%ld = affine.load %in[%i, %j] : memref<256x512xi64>
				%add = addi %part_sum, %ld : i64
				%mul = muli %part_prod, %ld : i64
				affine.yield %add, %mul : i64, i64
				}
				affine.store %sum, %out_sum[%i] : memref<256xi64>
				affine.store %prod, %out_prod[%i] : memref<256xi64>
				}
				return
				}

				// CHECK-LABEL: @two_vecdim_reductions_int
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0> : vector<128xi64>
				// CHECK: %[[vone:.*]] = constant dense<1> : vector<128xi64>
				// CHECK: %[[vred:.*]]:2 = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[part_sum:.*]] = %[[vzero]], %[[part_prod:.*]] = %[[vone]]) -> (vector<128xi64>, vector<128xi64>) {
				// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi64>, vector<128xi64>
				// CHECK: %[[add:.*]] = addi %[[part_sum]], %[[ld]] : vector<128xi64>
				// CHECK: %[[mul:.*]] = muli %[[part_prod]], %[[ld]] : vector<128xi64>
				// CHECK: affine.yield %[[add]], %[[mul]] : vector<128xi64>, vector<128xi64>
				// CHECK: }
				// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]]#0 : vector<128xi64> into i64
				// CHECK: %[[final_prod:.*]] = vector.reduction "mul", %[[vred:.*]]#1 : vector<128xi64> into i64
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xi64>
				// CHECK: affine.store %[[final_prod]], %{{.*}} : memref<256xi64>
				// CHECK: }

				// -----

				// The outer reduction loop '%j' is vectorized.

				func @vecdim_reduction_nested(%in: memref<256x512xf32>, %out: memref<1xf32>) {
				%cst = constant 0.000000e+00 : f32
				%outer_red = affine.for %j = 0 to 512 iter_args(%outer_iter = %cst) -> (f32) {
				%inner_red = affine.for %i = 0 to 256 iter_args(%inner_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %inner_iter, %ld : f32
				affine.yield %add : f32
				}
				%outer_add = addf %outer_iter, %inner_red : f32
				affine.yield %outer_add : f32
				}
				affine.store %outer_red, %out[0] : memref<1xf32>
				return
				}

				// CHECK-LABEL: @vecdim_reduction_nested
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[outer_red:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[outer_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[inner_red:.*]] = affine.for %{{.*}} = 0 to 256 iter_args(%[[inner_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[inner_iter]], %[[ld]] : vector<128xf32>
				// CHECK: affine.yield %[[add]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[outer_add:.*]] = addf %[[outer_iter]], %[[inner_red]] : vector<128xf32>
				// CHECK: affine.yield %[[outer_add]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[outer_red:.*]] : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<1xf32>

				// -----

				// The inner reduction loop '%j' computes partial sums as a side effect and
				// is not vectorized.

				func @vecdim_partial_sums_1_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) {
				%cst = constant 1.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %part_sum, %ld : f32
				%mul = mulf %part_prod, %ld : f32
				affine.store %add, %out_partsum[%i, %j] : memref<256x512xf32>
				affine.yield %add, %mul : f32, f32
				}
				affine.store %sum, %out_sum[%i] : memref<256xf32>
				affine.store %prod, %out_prod[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vecdim_partial_sums_1_rejected
				// CHECK-NOT: vector

				// -----

				// The inner reduction loop '%j' computes partial sums as a side effect and
				// is not vectorized.

				func @vecdim_partial_sums_2_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) {
				%cst = constant 1.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
				affine.store %part_sum, %out_partsum[%i, %j] : memref<256x512xf32>
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %part_sum, %ld : f32
				%mul = mulf %part_prod, %ld : f32
				affine.yield %add, %mul : f32, f32
				}
				affine.store %sum, %out_sum[%i] : memref<256xf32>
				affine.store %prod, %out_prod[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vecdim_partial_sums_2_rejected
				// CHECK-NOT: vector

				// -----

				// The inner reduction loop '%j' performs an unknown reduction operation and is
				// not vectorized.

				func @vecdim_unknown_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 1.000000e+00 : f32
				%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
				%add = addf %red_iter, %red_iter : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[0] : memref<256xf32>
				return
				}

				// CHECK-LABEL: @vecdim_unknown_reduction_rejected
				// CHECK-NOT: vector

				// -----

				// The inner reduction loop '%j' doesn't perform any operation, which is not
				// recognized as a standard reduction, so it is not vectorized.

				func @vecdim_none_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 1.000000e+00 : f32
				%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
				affine.yield %red_iter : f32
				}
				affine.store %final_red, %out[0] : memref<256xf32>
				return
				}

				// CHECK-LABEL: @vecdim_none_reduction_rejected
				// CHECK-NOT: vector

				// -----

				// The number of iterations is not divisible by the vector size, so a mask has
				// to be applied to the last update of the accumulator.

				func @vecdim_reduction_masked(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to 500 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK: #[[$map0:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 500)>
				// CHECK-LABEL: @vecdim_reduction_masked
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[vred:.*]] = affine.for %[[iv:.*]] = 0 to 500 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[elems_left:.*]] = affine.apply #[[$map0]](%[[iv]])
				// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
				// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
				// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
				// CHECK: affine.yield %[[new_acc]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
				// CHECK: }

				// -----

				// The number of iterations is not known, so a mask has to be applied.

				func @vecdim_reduction_masked_unknown_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %bnd: index) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to %bnd iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK: #[[$map1:.*]] = affine_map<([[d0:.*]]){{\[}}[[s0:.*]]{{\]}} -> (-[[d0]] + [[s0]])>
				// CHECK-LABEL: @vecdim_reduction_masked_unknown_ub
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
				// CHECK: %[[vred:.*]] = affine.for %[[iv:.*]] = 0 to %[[bnd:.*]] step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
				// CHECK: %[[elems_left:.*]] = affine.apply #[[$map1]](%[[iv]])[%[[bnd]]]
				// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
				// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
				// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
				// CHECK: affine.yield %[[new_acc]] : vector<128xf32>
				// CHECK: }
				// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
				// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
				// CHECK: }

				// -----

				// The lower bound is nonzero, but the number of iterations is divisible by the
				// vector size, so masking is not needed.

				func @vecdim_reduction_nonzero_lb(%in: memref<256x512xf32>, %out: memref<256xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 127 to 511 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK-LABEL: @vecdim_reduction_nonzero_lb
				// CHECK: %{{.*}} = affine.for %{{.*}} = 127 to 511 step 128 iter_args({{.*}}) -> (vector<128xf32>) {
				// CHECK-NOT: vector.create_mask

				// -----

				// The lower bound is unknown, so we need to create a mask.

				func @vecdim_reduction_masked_unknown_lb(%in: memref<256x512xf32>, %out: memref<256xf32>, %lb: index) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = %lb to 512 iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK: #[[$map2:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 512)>
				// CHECK-LABEL: @vecdim_reduction_masked_unknown_lb
				// CHECK: %{{.*}} = affine.for %[[iv:.*]] = %[[lb:.*]] to 512 step 128 iter_args(%[[red_iter:.*]] = {{.*}}) -> (vector<128xf32>) {
				// CHECK: %[[elems_left:.*]] = affine.apply #[[$map2]](%[[iv]])
				// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
				// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
				// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
				// CHECK: affine.yield %[[new_acc]] : vector<128xf32>

				// -----

				// The upper bound is a minimum expression.

				func @vecdim_reduction_complex_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %M: index, %N: index) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_red = affine.for %j = 0 to min affine_map<(d0, d1) -> (d0, d1*2)>(%M, %N) iter_args(%red_iter = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%add = addf %red_iter, %ld : f32
				affine.yield %add : f32
				}
				affine.store %final_red, %out[%i] : memref<256xf32>
				}
				return
				}

				// CHECK: #[[$map3:.*]] = affine_map<([[d0:.*]], [[d1:.*]]) -> ([[d0]], [[d1]] * 2)>
				// CHECK: #[[$map3_sub:.*]] = affine_map<([[d0:.*]], [[d1:.*]]) -> ([[d0]] - [[d1]])>
				// CHECK-LABEL: @vecdim_reduction_complex_ub
				// CHECK: %{{.*}} = affine.for %[[iv:.*]] = 0 to min #[[$map3]](%[[M:.*]], %[[N:.*]]) step 128 iter_args(%[[red_iter:.*]] = {{.*}}) -> (vector<128xf32>) {
				// CHECK: %[[ub:.*]] = affine.min #[[$map3]](%[[M]], %[[N]])
				// CHECK: %[[elems_left:.*]] = affine.apply #[[$map3_sub]](%[[ub]], %[[iv]])
				// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
				// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
				// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
				// CHECK: affine.yield %[[new_acc]] : vector<128xf32>

				// -----

				// The same mask is applied to both reductions.

				func @vecdim_two_reductions_masked(%in: memref<256x512xf32>, %out: memref<512xf32>) {
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				%final_sum, %final_expsum = affine.for %j = 0 to 500 iter_args(%sum_iter = %cst, %expsum_iter = %cst) -> (f32, f32) {
				%ld = affine.load %in[%i, %j] : memref<256x512xf32>
				%exp = math.exp %ld : f32
				%add = addf %sum_iter, %ld : f32
				%eadd = addf %expsum_iter, %exp : f32
				affine.yield %add, %eadd : f32, f32
				}
				affine.store %final_sum, %out[2*%i] : memref<512xf32>
				affine.store %final_expsum, %out[2*%i + 1] : memref<512xf32>
				}
				return
				}

				// CHECK: #[[$map4:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 500)>
				// CHECK-LABEL: @vecdim_two_reductions_masked
				// CHECK: affine.for %{{.*}} = 0 to 256 {
				// CHECK: %{{.*}} = affine.for %[[iv:.*]] = 0 to 500 step 128 iter_args(%[[sum_iter:.*]] = {{.*}}, %[[esum_iter:.*]] = {{.*}}) -> (vector<128xf32>, vector<128xf32>) {
				// CHECK: %[[elems_left:.*]] = affine.apply #[[$map4]](%[[iv]])
				// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
				// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
				// CHECK: %[[exp:.*]] = math.exp %[[ld]] : vector<128xf32>
				// CHECK: %[[add:.*]] = addf %[[sum_iter]], %[[ld]] : vector<128xf32>
				// CHECK: %[[eadd:.*]] = addf %[[esum_iter]], %[[exp]] : vector<128xf32>
				// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[sum_iter]] : vector<128xi1>, vector<128xf32>
				// CHECK: %[[new_eacc:.*]] = select %[[mask]], %[[eadd]], %[[esum_iter]] : vector<128xi1>, vector<128xf32>
				// CHECK: affine.yield %[[new_acc]], %[[new_eacc]] : vector<128xf32>
				// CHECK: }

mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction_2d.mlir

This file was added.

				// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0 vectorize-reductions=true" -verify-diagnostics

				// TODO: Vectorization of reduction loops along the reduction dimension is not
				// supported for higher-rank vectors yet, so we are just checking that an
				// error message is produced.

				// expected-error@+1 {{Vectorizing reductions is supported only for 1-D vectors}}
				func @vecdim_reduction_2d(%in: memref<256x512x1024xf32>, %out: memref<256xf32>) {
				// Negative test input: two nested reduction loops (%j over 512 and %k over
				// 1024) sum every element of %in[%i, *, *] into %out[%i]. The RUN line
				// asks for 2-D virtual vectors (32x256) with vectorize-reductions=true, and
				// the expected-error directive immediately above this function uses "@+1",
				// i.e. it targets the `func` line. Keep that directive directly adjacent
				// to `func`; do not insert lines between them.
				%cst = constant 0.000000e+00 : f32
				affine.for %i = 0 to 256 {
				// Outer reduction over %j: accumulates the per-(%i, %j) partial sums.
				%sum_j = affine.for %j = 0 to 512 iter_args(%red_iter_j = %cst) -> (f32) {
				// Inner reduction over %k: sums the 1024 elements %in[%i, %j, 0..1023].
				%sum_k = affine.for %k = 0 to 1024 iter_args(%red_iter_k = %cst) -> (f32) {
				%ld = affine.load %in[%i, %j, %k] : memref<256x512x1024xf32>
				%add = addf %red_iter_k, %ld : f32
				affine.yield %add : f32
				}
				// Note: the name %add is reused here; the earlier %add is defined inside
				// the inner loop's body, this one belongs to the outer loop's body.
				%add = addf %red_iter_j, %sum_k : f32
				affine.yield %add : f32
				}
				affine.store %sum_j, %out[%i] : memref<256xf32>
				}
				return
				}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Affine][Vector] Support vectorizing reduction loops
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 343080

mlir/include/mlir/Analysis/AffineAnalysis.h

mlir/include/mlir/Dialect/Affine/Passes.td

mlir/include/mlir/Dialect/Affine/Utils.h

mlir/include/mlir/Dialect/StandardOps/IR/Ops.h

mlir/include/mlir/Dialect/Vector/VectorOps.h

mlir/lib/Analysis/AffineAnalysis.cpp

mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp

mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp

mlir/lib/Dialect/StandardOps/IR/Ops.cpp

mlir/lib/Dialect/Vector/VectorOps.cpp

mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir

mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir

mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction_2d.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Affine][Vector] Support vectorizing reduction loops
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 343080

mlir/include/mlir/Analysis/AffineAnalysis.h

mlir/include/mlir/Dialect/Affine/Passes.td

mlir/include/mlir/Dialect/Affine/Utils.h

mlir/include/mlir/Dialect/StandardOps/IR/Ops.h

mlir/include/mlir/Dialect/Vector/VectorOps.h

mlir/lib/Analysis/AffineAnalysis.cpp

mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp

mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp

mlir/lib/Dialect/StandardOps/IR/Ops.cpp

mlir/lib/Dialect/Vector/VectorOps.cpp

mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir

mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir

mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction_2d.mlir

[mlir][Affine][Vector] Support vectorizing reduction loops
ClosedPublic