Index: mlir/include/mlir/Analysis/AffineAnalysis.h
===================================================================
--- mlir/include/mlir/Analysis/AffineAnalysis.h
+++ mlir/include/mlir/Analysis/AffineAnalysis.h
@@ -47,6 +47,11 @@
     AffineForOp forOp,
     SmallVectorImpl<LoopReduction> *parallelReductions = nullptr);
 
+/// Returns true if `forOp' doesn't have memory dependences preventing
+/// parallelization. This function doesn't check iter_args and should be used
+/// only as a building block for full parallel-checking functions.
+bool isLoopMemoryParallel(AffineForOp forOp);
+
 /// Returns in `affineApplyOps`, the sequence of those AffineApplyOp
 /// Operations that are reachable via a search starting from `operands` and
 /// ending at those operands that are not the result of an AffineApplyOp.
Index: mlir/include/mlir/Dialect/Affine/Passes.td
===================================================================
--- mlir/include/mlir/Dialect/Affine/Passes.td
+++ mlir/include/mlir/Dialect/Affine/Passes.td
@@ -112,7 +112,11 @@
              "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory "
              "dimensions to match. See defaultPatterns in Vectorize.cpp for "
              "a description and examples. This is used for testing purposes",
-             "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">
+             "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
+    Option<"vectorizeReductions", "vectorize-reductions", "bool",
+           /*default=*/"false",
+           "Vectorize known reductions expressed via iter_args. "
+           "Switched off by default.">
   ];
 }
Index: mlir/include/mlir/Dialect/Affine/Utils.h
===================================================================
--- mlir/include/mlir/Dialect/Affine/Utils.h
+++ mlir/include/mlir/Dialect/Affine/Utils.h
@@ -13,6 +13,7 @@
 #ifndef MLIR_DIALECT_AFFINE_UTILS_H
 #define MLIR_DIALECT_AFFINE_UTILS_H
 
+#include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/DenseMap.h"
@@ -27,6 +28,8 @@
 struct LoopReduction;
 class Operation;
 
+using ReductionLoopMap = DenseMap<Operation *, SmallVector<LoopReduction, 2>>;
+
 /// Replaces parallel affine.for op with 1-d affine.parallel op.
 /// mlir::isLoopParallel detects the parallel affine.for ops.
 /// Parallelizes the specified reductions. Parallelization will fail in presence
@@ -81,16 +84,23 @@
   // The candidate will be vectorized using the vectorization factor in
   // 'vectorSizes' for that dimension.
   DenseMap<Operation *, unsigned> loopToVectorDim;
+  // Maps loops that implement vectorizable reductions to the corresponding
+  // reduction descriptors.
+  ReductionLoopMap reductionLoops;
 };
 
 /// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
 /// 'vectorSizes'. By default, each vectorization factor is applied
 /// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
 /// be optionally used to provide a different loop vectorization order.
+/// If `reductionLoops` is not empty, the given reduction loops may be
+/// vectorized along the reduction dimension.
+/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
 void vectorizeAffineLoops(
     Operation *parentOp,
     llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
-    ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern);
+    ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern,
+    const ReductionLoopMap &reductionLoops = ReductionLoopMap());
 
 /// External utility to vectorize affine loops from a single loop nest using an
 /// n-D vectorization strategy (see doc in VectorizationStrategy definition).
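(Editorial note, not part of the patch.) The new `reductionLoops` parameter above is how a caller opts specific loops into reduction vectorization. Below is a minimal sketch of how a client might populate it, mirroring the pass driver added later in this patch; the wrapper name `vectorizeAllWithReductions` and its parameters are illustrative, not APIs introduced here.

```c++
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/IR/BuiltinOps.h"

using namespace mlir;

// Sketch: collect parallel affine.for ops (including reductions carried by
// iter_args) and hand them to the reduction-aware vectorization entry point.
static void vectorizeAllWithReductions(FuncOp func, int64_t vectorSize) {
  DenseSet<Operation *, DenseMapInfo<Operation *>> parallelLoops;
  ReductionLoopMap reductionLoops;

  func.walk([&](AffineForOp loop) {
    SmallVector<LoopReduction, 2> reductions;
    // `isLoopParallel` also reports the reductions expressed via iter_args.
    if (isLoopParallel(loop, &reductions)) {
      parallelLoops.insert(loop);
      if (!reductions.empty())
        reductionLoops[loop] = reductions;
    }
  });

  // Reduction vectorization is only supported for 1-D vectorization, so a
  // single vector size is passed.
  vectorizeAffineLoops(func, parallelLoops, {vectorSize},
                       /*fastestVaryingPattern=*/{}, reductionLoops);
}
```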
Index: mlir/include/mlir/Dialect/StandardOps/IR/Ops.h
===================================================================
--- mlir/include/mlir/Dialect/StandardOps/IR/Ops.h
+++ mlir/include/mlir/Dialect/StandardOps/IR/Ops.h
@@ -124,6 +124,20 @@
 /// Ignore integer bitwitdh and type mismatch that come from the fact there is
 /// no IndexAttr and that IndexType have no bitwidth.
 bool isEqualConstantIntOrValue(OpFoldResult ofr1, OpFoldResult ofr2);
+
+/// Returns the identity value attribute associated with an AtomicRMWKind op.
+Attribute getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
+                               OpBuilder &builder, Location loc);
+
+/// Returns the identity value associated with an AtomicRMWKind op.
+Value getIdentityValue(AtomicRMWKind op, Type resultType, OpBuilder &builder,
+                       Location loc);
+
+/// Returns the value obtained by applying the reduction operation kind
+/// associated with a binary AtomicRMWKind op to `lhs` and `rhs`.
+Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
+                     Value lhs, Value rhs);
+
 } // end namespace mlir
 
 #endif // MLIR_DIALECT_IR_STANDARDOPS_IR_OPS_H
Index: mlir/include/mlir/Dialect/Vector/VectorOps.h
===================================================================
--- mlir/include/mlir/Dialect/Vector/VectorOps.h
+++ mlir/include/mlir/Dialect/Vector/VectorOps.h
@@ -13,6 +13,7 @@
 #ifndef MLIR_DIALECT_VECTOR_VECTOROPS_H
 #define MLIR_DIALECT_VECTOR_VECTOROPS_H
 
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -184,6 +185,11 @@
 /// the integer type required for subscripts in the vector dialect.
 ArrayAttr getVectorSubscriptAttr(Builder &b, ArrayRef<int64_t> values);
 
+/// Returns the value obtained by reducing the vector into a scalar using the
+/// operation kind associated with a binary AtomicRMWKind op.
+Value getVectorReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
+                           Value vector);
+
 namespace impl {
 /// Build the default minor identity map suitable for a vector transfer. This
 /// also handles the case memref<... x vector<...>> -> vector<...> in which the
Index: mlir/lib/Analysis/AffineAnalysis.cpp
===================================================================
--- mlir/lib/Analysis/AffineAnalysis.cpp
+++ mlir/lib/Analysis/AffineAnalysis.cpp
@@ -70,6 +70,8 @@
     return nullptr;
   if (!forOp.getRegionIterArgs()[pos].hasOneUse())
     return nullptr;
+  if (!yielded.hasOneUse())
+    return nullptr;
 
   Optional<AtomicRMWKind> maybeKind =
       TypeSwitch<Operation *, Optional<AtomicRMWKind>>(definition)
@@ -123,6 +125,14 @@
     return false;
   }
 
+  // Check memory dependences.
+  return isLoopMemoryParallel(forOp);
+}
+
+/// Returns true if `forOp' doesn't have memory dependences preventing
+/// parallelization. This function doesn't check iter_args and should be used
+/// only as a building block for full parallel-checking functions.
+bool mlir::isLoopMemoryParallel(AffineForOp forOp) {
   // Collect all load and store ops in loop nest rooted at 'forOp'.
   SmallVector<Operation *, 8> loadAndStoreOps;
   auto walkResult = forOp.walk([&](Operation *op) -> WalkResult {
Index: mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
===================================================================
--- mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
+++ mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
@@ -367,49 +367,6 @@
   }
 };
 
-/// Returns the identity value associated with an AtomicRMWKind op.
-static Value getIdentityValue(AtomicRMWKind op, Type resultType,
-                              OpBuilder &builder, Location loc) {
-  switch (op) {
-  case AtomicRMWKind::addf:
-    return builder.create<ConstantOp>(loc, builder.getFloatAttr(resultType, 0));
-  case AtomicRMWKind::addi:
-    return builder.create<ConstantOp>(loc,
-                                      builder.getIntegerAttr(resultType, 0));
-  case AtomicRMWKind::mulf:
-    return builder.create<ConstantOp>(loc, builder.getFloatAttr(resultType, 1));
-  case AtomicRMWKind::muli:
-    return builder.create<ConstantOp>(loc,
-                                      builder.getIntegerAttr(resultType, 1));
-  // TODO: Add remaining reduction operations.
-  default:
-    (void)emitOptionalError(loc, "Reduction operation type not supported");
-    break;
-  }
-  return nullptr;
-}
-
-/// Return the value obtained by applying the reduction operation kind
-/// associated with a binary AtomicRMWKind op to `lhs` and `rhs`.
-static Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
-                            Value lhs, Value rhs) {
-  switch (op) {
-  case AtomicRMWKind::addf:
-    return builder.create<AddFOp>(loc, lhs, rhs);
-  case AtomicRMWKind::addi:
-    return builder.create<AddIOp>(loc, lhs, rhs);
-  case AtomicRMWKind::mulf:
-    return builder.create<MulFOp>(loc, lhs, rhs);
-  case AtomicRMWKind::muli:
-    return builder.create<MulIOp>(loc, lhs, rhs);
-  // TODO: Add remaining reduction operations.
-  default:
-    (void)emitOptionalError(loc, "Reduction operation type not supported");
-    break;
-  }
-  return nullptr;
-}
-
 /// Convert an `affine.parallel` (loop nest) operation into a `scf.parallel`
 /// operation.
 class AffineParallelLowering : public OpRewritePattern<AffineParallelOp> {
Index: mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
===================================================================
--- mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -21,6 +21,7 @@
 #include "mlir/Dialect/Vector/VectorUtils.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/Support/LLVM.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Debug.h"
 
 using namespace mlir;
@@ -324,7 +325,7 @@
 /// Unsupported cases, extensions, and work in progress (help welcome :-) ):
 /// ========================================================================
 ///   1. lowering to concrete vector types for various HW;
-///   2. reduction support;
+///   2. reduction support for n-D vectorization and non-unit steps;
 ///   3. non-effecting padding during vector.transfer_read and filter during
 ///      vector.transfer_write;
 ///   4. misalignment support vector.transfer_read / vector.transfer_write
@@ -487,6 +488,73 @@
 ///
 /// Of course, much more intricate n-D imperfectly-nested patterns can be
 /// vectorized too and specified in a fully declarative fashion.
+///
+/// Reduction:
+/// ==========
+/// Vectorizing reduction loops along the reduction dimension is supported if:
+///   - the reduction kind is supported,
+///   - the vectorization is 1-D, and
+///   - the step size of the loop is equal to one.
+///
+/// Compared to the non-vector-dimension case, two additional things are done
+/// during vectorization of such loops:
+///   - The resulting vector returned from the loop is reduced to a scalar
+///     using `vector.reduction`.
+///   - In some cases a mask is applied to the vector yielded at the end of the
+///     loop to prevent garbage values from being written to the accumulator.
+///
+/// Reduction vectorization is switched off by default; it can be enabled by
+/// passing a map from loops to reductions to utility functions, or by passing
+/// `vectorize-reductions=true` to the vectorization pass.
+///
+/// Consider the following example:
+/// ```mlir
+/// func @vecred(%in: memref<512xf32>) -> f32 {
+///   %cst = constant 0.000000e+00 : f32
+///   %sum = affine.for %i = 0 to 500 iter_args(%part_sum = %cst) -> (f32) {
+///     %ld = affine.load %in[%i] : memref<512xf32>
+///     %cos = math.cos %ld : f32
+///     %add = addf %part_sum, %cos : f32
+///     affine.yield %add : f32
+///   }
+///   return %sum : f32
+/// }
+/// ```
+///
+/// The -affine-super-vectorize pass with the following arguments:
+/// ```
+/// -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0 \
+///                          vectorize-reductions=true"
+/// ```
+/// produces the following output:
+/// ```mlir
+/// #map = affine_map<(d0) -> (-d0 + 500)>
+/// func @vecred(%arg0: memref<512xf32>) -> f32 {
+///   %cst = constant 0.000000e+00 : f32
+///   %cst_0 = constant dense<0.000000e+00> : vector<128xf32>
+///   %0 = affine.for %arg1 = 0 to 500 step 128 iter_args(%arg2 = %cst_0)
+///           -> (vector<128xf32>) {
+///     // %2 is the number of iterations left in the original loop.
+///     %2 = affine.apply #map(%arg1)
+///     %3 = vector.create_mask %2 : vector<128xi1>
+///     %cst_1 = constant 0.000000e+00 : f32
+///     %4 = vector.transfer_read %arg0[%arg1], %cst_1 :
+///                     memref<512xf32>, vector<128xf32>
+///     %5 = math.cos %4 : vector<128xf32>
+///     %6 = addf %arg2, %5 : vector<128xf32>
+///     // We filter out the effect of the last 12 elements using the mask.
+///     %7 = select %3, %6, %arg2 : vector<128xi1>, vector<128xf32>
+///     affine.yield %7 : vector<128xf32>
+///   }
+///   %1 = vector.reduction "add", %0 : vector<128xf32> into f32
+///   return %1 : f32
+/// }
+/// ```
+///
+/// Note that because of loop misalignment we needed to apply a mask to prevent
+/// the last 12 elements from affecting the final result. The mask is full of
+/// ones in every iteration except for the last one, in which it has the form
+/// `11...100...0` with 116 ones and 12 zeros.
 
 #define DEBUG_TYPE "early-vect"
 
@@ -644,6 +712,17 @@
   void registerValueScalarReplacement(BlockArgument replaced,
                                       BlockArgument replacement);
 
+  /// Registers the scalar replacement of a scalar result returned from a
+  /// reduction loop. 'replacement' must be scalar.
+  ///
+  /// This utility is used to register the replacement for scalar results of
+  /// vectorized reduction loops with iter_args.
+  ///
+  /// Example:
+  ///   * 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32)
+  ///   * 'replacement': %1 = vector.reduction "add" %0 : vector<4xf32> into f32
+  void registerLoopResultScalarReplacement(Value replaced, Value replacement);
+
   /// Returns in 'replacedVals' the scalar replacement for values in
   /// 'inputVals'.
   void getScalarValueReplacementsFor(ValueRange inputVals,
@@ -663,10 +742,16 @@
   // Maps input scalar values to their new scalar counterparts in the vector
   // loop nest.
   BlockAndValueMapping valueScalarReplacement;
+  // Maps results of reduction loops to their new scalar counterparts.
+  DenseMap<Value, Value> loopResultScalarReplacement;
 
   // Maps the newly created vector loops to their vector dimension.
   DenseMap<Operation *, unsigned> vecLoopToVecDim;
 
+  // Maps the new vectorized loops to the corresponding vector masks, if a mask
+  // is required.
+  DenseMap<Operation *, Value> vecLoopToMask;
+
   // The strategy drives which loop to vectorize by which amount.
   const VectorizationStrategy *strategy;
 
@@ -761,6 +846,26 @@
   registerValueScalarReplacementImpl(replaced, replacement);
 }
 
+/// Registers the scalar replacement of a scalar result returned from a
+/// reduction loop. 'replacement' must be scalar.
+///
+/// This utility is used to register the replacement for scalar results of
+/// vectorized reduction loops with iter_args.
+///
+/// Example:
+///   * 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32)
+///   * 'replacement': %1 = vector.reduction "add" %0 : vector<4xf32> into f32
+void VectorizationState::registerLoopResultScalarReplacement(
+    Value replaced, Value replacement) {
+  assert(isa<AffineForOp>(replaced.getDefiningOp()));
+  assert(loopResultScalarReplacement.count(replaced) == 0 &&
+         "already registered");
+  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ will replace a result of the loop "
+                       "with scalar: "
+                    << replacement);
+  loopResultScalarReplacement[replaced] = replacement;
+}
+
 void VectorizationState::registerValueScalarReplacementImpl(Value replaced,
                                                             Value replacement) {
   assert(!valueScalarReplacement.contains(replaced) &&
@@ -850,6 +955,101 @@
   return newConstOp;
 }
 
+/// Creates a constant vector filled with the neutral elements of the given
+/// reduction. The scalar type of vector elements will be taken from
+/// `oldOperand`.
+static ConstantOp createInitialVector(AtomicRMWKind reductionKind,
+                                      Value oldOperand,
+                                      VectorizationState &state) {
+  Type scalarTy = oldOperand.getType();
+  if (!VectorType::isValidElementType(scalarTy))
+    return nullptr;
+
+  Attribute valueAttr = getIdentityValueAttr(
+      reductionKind, scalarTy, state.builder, oldOperand.getLoc());
+  auto vecTy = getVectorType(scalarTy, state.strategy);
+  auto vecAttr = DenseElementsAttr::get(vecTy, valueAttr);
+  auto newConstOp =
+      state.builder.create<ConstantOp>(oldOperand.getLoc(), vecAttr);
+
+  return newConstOp;
+}
+
+/// Creates a mask used to filter out garbage elements in the last iteration
+/// of unaligned loops. If a mask is not required then `nullptr` is returned.
+/// The mask will be a vector of booleans representing meaningful vector
+/// elements in the current iteration. It is filled with ones for each iteration
+/// except for the last one, where it has the form `11...100...0` with the
+/// number of ones equal to the number of meaningful elements (i.e. the number
+/// of iterations that would be left in the original loop).
+static Value createMask(AffineForOp vecForOp, VectorizationState &state) {
+  assert(state.strategy->vectorSizes.size() == 1 &&
+         "Creating a mask for non-1-D vectors is not supported.");
+  assert(vecForOp.getStep() == state.strategy->vectorSizes[0] &&
+         "Creating a mask for loops with non-unit original step size is not "
+         "supported.");
+
+  // Check if we have already created the mask.
+  if (Value mask = state.vecLoopToMask.lookup(vecForOp))
+    return mask;
+
+  // If the loop has constant bounds and the original number of iterations is
+  // divisible by the vector size then we don't need a mask.
+  if (vecForOp.hasConstantBounds()) {
+    int64_t originalTripCount =
+        vecForOp.getConstantUpperBound() - vecForOp.getConstantLowerBound();
+    if (originalTripCount % vecForOp.getStep() == 0)
+      return nullptr;
+  }
+
+  OpBuilder::InsertionGuard guard(state.builder);
+  state.builder.setInsertionPointToStart(vecForOp.getBody());
+
+  // We generate the mask using the `vector.create_mask` operation, which
+  // accepts the number of meaningful elements (i.e. the length of the prefix
+  // of 1s). To compute the number of meaningful elements we subtract the
+  // current value of the iteration variable from the upper bound of the loop.
+  // Example:
+  //
+  //   // 500 is the upper bound of the loop
+  //   #map = affine_map<(d0) -> (500 - d0)>
+  //   %elems_left = affine.apply #map(%iv)
+  //   %mask = vector.create_mask %elems_left : vector<128xi1>
+
+  Location loc = vecForOp.getLoc();
+
+  // First we get the upper bound of the loop using `affine.apply` or
+  // `affine.min`.
+  AffineMap ubMap = vecForOp.getUpperBoundMap();
+  Value ub;
+  if (ubMap.getNumResults() == 1)
+    ub = state.builder.create<AffineApplyOp>(loc, vecForOp.getUpperBoundMap(),
+                                             vecForOp.getUpperBoundOperands());
+  else
+    ub = state.builder.create<AffineMinOp>(loc, vecForOp.getUpperBoundMap(),
+                                           vecForOp.getUpperBoundOperands());
+  // Then we compute the number of (original) iterations left in the loop.
+  AffineExpr subExpr =
+      state.builder.getAffineDimExpr(0) - state.builder.getAffineDimExpr(1);
+  Value itersLeft =
+      makeComposedAffineApply(state.builder, loc, AffineMap::get(2, 0, subExpr),
+                              {ub, vecForOp.getInductionVar()});
+  // If the affine maps were successfully composed then `ub` is unneeded.
+  if (ub.use_empty())
+    ub.getDefiningOp()->erase();
+  // Finally we create the mask.
+  Type maskTy = VectorType::get(state.strategy->vectorSizes,
                                state.builder.getIntegerType(1));
+  Value mask =
+      state.builder.create<vector::CreateMaskOp>(loc, maskTy, itersLeft);
+
+  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a mask:\n"
+                    << itersLeft << "\n"
+                    << mask << "\n");
+
+  state.vecLoopToMask[vecForOp] = mask;
+  return mask;
+}
+
 /// Returns true if the provided value is vector uniform given the vectorization
 /// strategy.
 // TODO: For now, only values that are invariants to all the loops in the
@@ -1023,26 +1223,40 @@
   return transfer;
 }
 
+/// Returns true if `value` is a constant equal to the neutral element of the
+/// given vectorizable reduction.
+static bool isNeutralElementConst(AtomicRMWKind reductionKind, Value value,
+                                  VectorizationState &state) {
+  Type scalarTy = value.getType();
+  if (!VectorType::isValidElementType(scalarTy))
+    return false;
+  Attribute valueAttr = getIdentityValueAttr(reductionKind, scalarTy,
+                                             state.builder, value.getLoc());
+  if (auto constOp = dyn_cast_or_null<ConstantOp>(value.getDefiningOp()))
+    return constOp.value() == valueAttr;
+  return false;
+}
+
 /// Vectorizes a loop with the vectorization strategy in 'state'. A new loop is
 /// created and registered as replacement for the scalar loop. The builder's
 /// insertion point is set to the new loop's body so that subsequent vectorized
 /// operations are inserted into the new loop. If the loop is a vector
 /// dimension, the step of the newly created loop will reflect the vectorization
 /// factor used to vectorized that dimension.
-// TODO: Add support for 'iter_args'. Related operands and results will be
-// vectorized at this point.
 static Operation *vectorizeAffineForOp(AffineForOp forOp,
                                        VectorizationState &state) {
   const VectorizationStrategy &strategy = *state.strategy;
   auto loopToVecDimIt = strategy.loopToVectorDim.find(forOp);
   bool isLoopVecDim = loopToVecDimIt != strategy.loopToVectorDim.end();
 
-  // We only support 'iter_args' when the loop is not one of the vector
-  // dimensions.
-  // TODO: Support vector dimension loops. They require special handling:
-  // generate horizontal reduction, last-value extraction, etc.
-  if (forOp.getNumIterOperands() > 0 && isLoopVecDim)
+  // TODO: Vectorization of reduction loops is not supported for non-unit steps.
+  if (isLoopVecDim && forOp.getNumIterOperands() > 0 && forOp.getStep() != 1) {
+    LLVM_DEBUG(
+        dbgs()
+        << "\n[early-vect]+++++ unsupported step size for reduction loop: "
+        << forOp.getStep() << "\n");
     return nullptr;
+  }
 
   // If we are vectorizing a vector dimension, compute a new step for the new
   // vectorized loop using the vectorization factor for the vector dimension.
@@ -1057,10 +1271,31 @@
     newStep = forOp.getStep();
   }
 
+  // Get information about reduction kinds.
+  ArrayRef<LoopReduction> reductions;
+  if (isLoopVecDim && forOp.getNumIterOperands() > 0) {
+    auto it = strategy.reductionLoops.find(forOp);
+    assert(it != strategy.reductionLoops.end() &&
+           "Reduction descriptors not found when vectorizing a reduction loop");
+    reductions = it->second;
+    assert(reductions.size() == forOp.getNumIterOperands() &&
+           "The size of reductions array must match the number of iter_args");
+  }
+
   // Vectorize 'iter_args'.
   SmallVector<Value, 8> vecIterOperands;
-  for (auto operand : forOp.getIterOperands())
-    vecIterOperands.push_back(vectorizeOperand(operand, state));
+  if (!isLoopVecDim) {
+    for (auto operand : forOp.getIterOperands())
+      vecIterOperands.push_back(vectorizeOperand(operand, state));
+  } else {
+    // For reduction loops we need to pass a vector of neutral elements as an
+    // initial value of the accumulator. We will add the original initial value
+    // later.
+    for (auto redAndOperand : llvm::zip(reductions, forOp.getIterOperands())) {
+      vecIterOperands.push_back(createInitialVector(
+          std::get<0>(redAndOperand).kind, std::get<1>(redAndOperand), state));
+    }
+  }
 
   auto vecForOp = state.builder.create<AffineForOp>(
       forOp.getLoc(), forOp.getLowerBoundOperands(), forOp.getLowerBoundMap(),
@@ -1075,13 +1310,16 @@
   // Register loop-related replacements:
   // 1) The new vectorized loop is registered as vector replacement of the
   //    scalar loop.
-  //    TODO: Support reductions along the vector dimension.
   // 2) The new iv of the vectorized loop is registered as scalar replacement
   //    since a scalar copy of the iv will prevail in the vectorized loop.
   //    TODO: A vector replacement will also be added in the future when
   //    vectorization of linear ops is supported.
   // 3) The new 'iter_args' region arguments are registered as vector
   //    replacements since they have been vectorized.
+  // 4) If the loop performs a reduction along the vector dimension, a
+  //    `vector.reduction` or similar op is inserted for each resulting value
+  //    of the loop and its scalar value replaces the corresponding scalar
+  //    result of the loop.
   state.registerOpVectorReplacement(forOp, vecForOp);
   state.registerValueScalarReplacement(forOp.getInductionVar(),
                                        vecForOp.getInductionVar());
@@ -1090,12 +1328,37 @@
     state.registerBlockArgVectorReplacement(std::get<0>(iterTuple),
                                             std::get<1>(iterTuple));
 
+  if (isLoopVecDim) {
+    for (unsigned i = 0; i < vecForOp.getNumIterOperands(); ++i) {
+      // First, we reduce the vector returned from the loop into a scalar.
+      Value reducedRes =
+          getVectorReductionOp(reductions[i].kind, state.builder,
+                               vecForOp.getLoc(), vecForOp.getResult(i));
+      LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a vector reduction: "
+                        << reducedRes);
+      // Then we combine it with the original (scalar) initial value unless it
+      // is equal to the neutral element of the reduction.
+      Value origInit = forOp.getOperand(forOp.getNumControlOperands() + i);
+      Value finalRes = reducedRes;
+      if (!isNeutralElementConst(reductions[i].kind, origInit, state))
+        finalRes = getReductionOp(reductions[i].kind, state.builder,
+                                  reducedRes.getLoc(), reducedRes, origInit);
+      state.registerLoopResultScalarReplacement(forOp.getResult(i), finalRes);
+    }
+  }
+
   if (isLoopVecDim)
     state.vecLoopToVecDim[vecForOp] = loopToVecDimIt->second;
 
   // Change insertion point so that upcoming vectorized instructions are
   // inserted into the vectorized loop's body.
   state.builder.setInsertionPointToStart(vecForOp.getBody());
+
+  // If this is a reduction loop then we may need to create a mask to filter out
+  // garbage in the last iteration.
+  if (isLoopVecDim && forOp.getNumIterOperands() > 0)
+    createMask(vecForOp, state);
+
   return vecForOp;
 }
 
@@ -1133,11 +1396,35 @@
 
 /// Vectorizes a yield operation by widening its types. The builder's insertion
 /// point is set after the vectorized parent op to continue vectorizing the
-/// operations after the parent op.
+/// operations after the parent op. When vectorizing a reduction loop, a mask
+/// may be used to prevent adding garbage values to the accumulator.
 static Operation *vectorizeAffineYieldOp(AffineYieldOp yieldOp,
                                          VectorizationState &state) {
   Operation *newYieldOp = widenOp(yieldOp, state);
   Operation *newParentOp = state.builder.getInsertionBlock()->getParentOp();
+
+  // If there is a mask for this loop then we must prevent garbage values from
+  // being added to the accumulator by inserting `select` operations, for
+  // example:
+  //
+  //   %res = addf %acc, %val : vector<128xf32>
+  //   %res_masked = select %mask, %res, %acc : vector<128xi1>, vector<128xf32>
+  //   affine.yield %res_masked : vector<128xf32>
+  //
+  if (Value mask = state.vecLoopToMask.lookup(newParentOp)) {
+    state.builder.setInsertionPoint(newYieldOp);
+    for (unsigned i = 0; i < newYieldOp->getNumOperands(); ++i) {
+      Value result = newYieldOp->getOperand(i);
+      Value iterArg = cast<AffineForOp>(newParentOp).getRegionIterArgs()[i];
+      Value maskedResult = state.builder.create<SelectOp>(result.getLoc(), mask,
                                                          result, iterArg);
+      LLVM_DEBUG(
+          dbgs() << "\n[early-vect]+++++ masking a yielded vector value: "
+                 << maskedResult);
+      newYieldOp->setOperand(i, maskedResult);
+    }
+  }
+
   state.builder.setInsertionPointAfter(newParentOp);
   return newYieldOp;
 }
@@ -1241,8 +1528,12 @@
   auto opVecResult = rootLoop.walk([&](Operation *op) {
     LLVM_DEBUG(dbgs() << "[early-vect]+++++ Vectorizing: " << *op);
     Operation *vectorOp = vectorizeOneOperation(op, state);
-    if (!vectorOp)
+    if (!vectorOp) {
+      LLVM_DEBUG(
+          dbgs() << "[early-vect]+++++ failed vectorizing the operation: "
+                 << *op << "\n");
       return WalkResult::interrupt();
+    }
 
     return WalkResult::advance();
   });
@@ -1258,6 +1549,11 @@
     return failure();
   }
 
+  // Replace results of reduction loops with the scalar values computed using
+  // `vector.reduction` or similar ops.
+  for (auto resPair : state.loopResultScalarReplacement)
+    resPair.first.replaceAllUsesWith(resPair.second);
+
   assert(state.opVectorReplacement.count(rootLoop) == 1 &&
          "Expected vector replacement for loop nest");
   LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
@@ -1325,10 +1621,15 @@
 /// vectorization factors in 'vectorSizes'. By default, each vectorization
 /// factor is applied inner-to-outer to the loops of each loop nest.
 /// 'fastestVaryingPattern' can be optionally used to provide a different loop
-/// vectorization order.
+/// vectorization order. `reductionLoops` can be provided to specify loops
+/// which can be vectorized along the reduction dimension.
 static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
                            ArrayRef<int64_t> vectorSizes,
-                           ArrayRef<int64_t> fastestVaryingPattern) {
+                           ArrayRef<int64_t> fastestVaryingPattern,
+                           const ReductionLoopMap &reductionLoops) {
+  assert((reductionLoops.empty() || vectorSizes.size() == 1) &&
+         "Vectorizing reductions is supported only for 1-D vectors");
+
   // Compute 1-D, 2-D or 3-D loop pattern to be matched on the target loops.
   Optional<NestedPattern> pattern =
       makePattern(loops, vectorSizes.size(), fastestVaryingPattern);
@@ -1359,6 +1660,7 @@
     VectorizationStrategy strategy;
     // TODO: depending on profitability, elect to reduce the vector size.
     strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());
+    strategy.reductionLoops = reductionLoops;
    if (failed(analyzeProfitability(match.getMatchedChildren(), 1, patternDepth,
                                     &strategy))) {
       continue;
     }
@@ -1396,15 +1698,37 @@
     return signalPassFailure();
   }
 
+  if (vectorizeReductions && vectorSizes.size() != 1) {
+    f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
+    return signalPassFailure();
+  }
+
   DenseSet<Operation *> parallelLoops;
-  f.walk([&parallelLoops](AffineForOp loop) {
-    if (isLoopParallel(loop))
-      parallelLoops.insert(loop);
-  });
+  ReductionLoopMap reductionLoops;
+
+  // If 'vectorize-reductions=true' is provided, we also populate the
+  // `reductionLoops` map.
+  if (vectorizeReductions) {
+    f.walk([&parallelLoops, &reductionLoops](AffineForOp loop) {
+      SmallVector<LoopReduction, 2> reductions;
+      if (isLoopParallel(loop, &reductions)) {
+        parallelLoops.insert(loop);
+        // If it's not a reduction loop, adding it to the map is not necessary.
+        if (!reductions.empty())
+          reductionLoops[loop] = reductions;
+      }
+    });
+  } else {
+    f.walk([&parallelLoops](AffineForOp loop) {
+      if (isLoopParallel(loop))
+        parallelLoops.insert(loop);
+    });
+  }
 
   // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
   NestedPatternContext mlContext;
-  vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
+  vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern,
+                 reductionLoops);
 }
 
 /// Verify that affine loops in 'loops' meet the nesting criteria expected by
@@ -1452,12 +1776,17 @@
 /// factor is applied inner-to-outer to the loops of each loop nest.
 /// 'fastestVaryingPattern' can be optionally used to provide a different loop
 /// vectorization order.
+/// If `reductionLoops` is not empty, the given reduction loops may be
+/// vectorized along the reduction dimension.
+/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
 void vectorizeAffineLoops(
     Operation *parentOp,
     DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
    ArrayRef<int64_t> vectorSizes,
-    ArrayRef<int64_t> fastestVaryingPattern) {
+    ArrayRef<int64_t> fastestVaryingPattern,
+    const ReductionLoopMap &reductionLoops) {
   // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
   NestedPatternContext mlContext;
-  vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern);
+  vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern,
+                 reductionLoops);
 }
 
 /// External utility to vectorize affine loops from a single loop nest using an
Index: mlir/lib/Dialect/StandardOps/IR/Ops.cpp
===================================================================
--- mlir/lib/Dialect/StandardOps/IR/Ops.cpp
+++ mlir/lib/Dialect/StandardOps/IR/Ops.cpp
@@ -352,6 +352,53 @@
   return success();
 }
 
+/// Returns the identity value attribute associated with an AtomicRMWKind op.
+Attribute mlir::getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
+                                     OpBuilder &builder, Location loc) {
+  switch (kind) {
+  case AtomicRMWKind::addf:
+  case AtomicRMWKind::addi:
+    return builder.getZeroAttr(resultType);
+  case AtomicRMWKind::muli:
+    return builder.getIntegerAttr(resultType, 1);
+  case AtomicRMWKind::mulf:
+    return builder.getFloatAttr(resultType, 1);
+  // TODO: Add remaining reduction operations.
+  default:
+    (void)emitOptionalError(loc, "Reduction operation type not supported");
+    break;
+  }
+  return nullptr;
+}
+
+/// Returns the identity value associated with an AtomicRMWKind op.
+Value mlir::getIdentityValue(AtomicRMWKind op, Type resultType,
+                             OpBuilder &builder, Location loc) {
+  Attribute attr = getIdentityValueAttr(op, resultType, builder, loc);
+  return builder.create<ConstantOp>(loc, attr);
+}
+
+/// Return the value obtained by applying the reduction operation kind
+/// associated with a binary AtomicRMWKind op to `lhs` and `rhs`.
+Value mlir::getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
+                           Value lhs, Value rhs) {
+  switch (op) {
+  case AtomicRMWKind::addf:
+    return builder.create<AddFOp>(loc, lhs, rhs);
+  case AtomicRMWKind::addi:
+    return builder.create<AddIOp>(loc, lhs, rhs);
+  case AtomicRMWKind::mulf:
+    return builder.create<MulFOp>(loc, lhs, rhs);
+  case AtomicRMWKind::muli:
+    return builder.create<MulIOp>(loc, lhs, rhs);
+  // TODO: Add remaining reduction operations.
+  default:
+    (void)emitOptionalError(loc, "Reduction operation type not supported");
+    break;
+  }
+  return nullptr;
+}
+
 //===----------------------------------------------------------------------===//
 // GenericAtomicRMWOp
 //===----------------------------------------------------------------------===//
Index: mlir/lib/Dialect/Vector/VectorOps.cpp
===================================================================
--- mlir/lib/Dialect/Vector/VectorOps.cpp
+++ mlir/lib/Dialect/Vector/VectorOps.cpp
@@ -333,6 +333,28 @@
   p << " : " << op.vector().getType() << " into " << op.dest().getType();
 }
 
+Value mlir::vector::getVectorReductionOp(AtomicRMWKind op, OpBuilder &builder,
+                                         Location loc, Value vector) {
+  Type scalarType = vector.getType().cast<VectorType>().getElementType();
+  switch (op) {
+  case AtomicRMWKind::addf:
+  case AtomicRMWKind::addi:
+    return builder.create<vector::ReductionOp>(vector.getLoc(), scalarType,
+                                               builder.getStringAttr("add"),
+                                               vector, ValueRange{});
+  case AtomicRMWKind::mulf:
+  case AtomicRMWKind::muli:
+    return builder.create<vector::ReductionOp>(vector.getLoc(), scalarType,
+                                               builder.getStringAttr("mul"),
+                                               vector, ValueRange{});
+  // TODO: Add remaining reduction operations.
+  default:
+    (void)emitOptionalError(loc, "Reduction operation type not supported");
+    break;
+  }
+  return nullptr;
+}
+
 //===----------------------------------------------------------------------===//
 // ContractionOp
 //===----------------------------------------------------------------------===//
Index: mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
===================================================================
--- mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
+++ mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
@@ -590,3 +590,24 @@
 // CHECK:       }
 // CHECK:       vector.transfer_write %[[last_val]], %{{.*}} : vector<128xf32>, memref<256xf32>
 // CHECK:     }
+
+// -----
+
+// The inner reduction loop '%j' is not vectorized if we do not request
+// reduction vectorization.
+ +func @vec_vecdim_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = constant 0.000000e+00 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %add = addf %red_iter, %ld : f32 + affine.yield %add : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: @vec_vecdim_reduction_rejected +// CHECK-NOT: vector Index: mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir =================================================================== --- /dev/null +++ mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir @@ -0,0 +1,468 @@ +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0 vectorize-reductions=true" -split-input-file | FileCheck %s + +// The inner reduction loop '%j' is vectorized. + +func @vecdim_reduction(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = constant 0.000000e+00 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %add = addf %red_iter, %ld : f32 + affine.yield %add : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: @vecdim_reduction +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: affine.yield %[[add]] : vector<128xf32> +// CHECK: } +// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32 +// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32> +// CHECK: } + +// ----- + +// The inner reduction loop '%j' is vectorized. (The order of addf's operands is +// different than in the previous test case). + +func @vecdim_reduction_comm(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = constant 0.000000e+00 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %add = addf %ld, %red_iter : f32 + affine.yield %add : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: @vecdim_reduction_comm +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[add:.*]] = addf %[[ld]], %[[red_iter]] : vector<128xf32> +// CHECK: affine.yield %[[add]] : vector<128xf32> +// CHECK: } +// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32 +// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32> +// CHECK: } + +// ----- + +// The inner reduction loop '%j' is vectorized. Transforming the input before +// performing the accumulation doesn't cause any problem. 
+ +func @vecdim_reduction_expsin(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = constant 0.000000e+00 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %sin = math.sin %ld : f32 + %exp = math.exp %sin : f32 + %add = addf %red_iter, %exp : f32 + affine.yield %add : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: @vecdim_reduction_expsin +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[sin:.*]] = math.sin %[[ld]] +// CHECK: %[[exp:.*]] = math.exp %[[sin]] +// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[exp]] : vector<128xf32> +// CHECK: affine.yield %[[add]] : vector<128xf32> +// CHECK: } +// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32 +// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32> +// CHECK: } + +// ----- + +// Two reductions at the same time. The inner reduction loop '%j' is vectorized. + +func @two_vecdim_reductions(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>) { + %cst = constant 1.000000e+00 : f32 + affine.for %i = 0 to 256 { + // Note that we pass the same constant '1.0' as initial values for both + // reductions. + %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %add = addf %part_sum, %ld : f32 + %mul = mulf %part_prod, %ld : f32 + affine.yield %add, %mul : f32, f32 + } + affine.store %sum, %out_sum[%i] : memref<256xf32> + affine.store %prod, %out_prod[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: @two_vecdim_reductions +// CHECK: %[[cst:.*]] = constant 1.000000e+00 : f32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32> +// CHECK: %[[vone:.*]] = constant dense<1.000000e+00> : vector<128xf32> +// CHECK: %[[vred:.*]]:2 = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[part_sum:.*]] = %[[vzero]], %[[part_prod:.*]] = %[[vone]]) -> (vector<128xf32>, vector<128xf32>) { +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[add:.*]] = addf %[[part_sum]], %[[ld]] : vector<128xf32> +// CHECK: %[[mul:.*]] = mulf %[[part_prod]], %[[ld]] : vector<128xf32> +// CHECK: affine.yield %[[add]], %[[mul]] : vector<128xf32>, vector<128xf32> +// CHECK: } +// CHECK: %[[nonfinal_sum:.*]] = vector.reduction "add", %[[vred:.*]]#0 : vector<128xf32> into f32 +// Note that to compute the final sum we need to add the original initial value +// (%cst) since it is not zero. +// CHECK: %[[final_sum:.*]] = addf %[[nonfinal_sum]], %[[cst]] : f32 +// For the final product we don't need to do this additional step because the +// initial value equals to 1 (the neutral element for multiplication). +// CHECK: %[[final_prod:.*]] = vector.reduction "mul", %[[vred:.*]]#1 : vector<128xf32> into f32 +// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32> +// CHECK: affine.store %[[final_prod]], %{{.*}} : memref<256xf32> +// CHECK: } + +// ----- + +// The integer case. 
+ +func @two_vecdim_reductions_int(%in: memref<256x512xi64>, %out_sum: memref<256xi64>, %out_prod: memref<256xi64>) { + %cst0 = constant 0 : i64 + %cst1 = constant 1 : i64 + affine.for %i = 0 to 256 { + %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst0, %part_prod = %cst1) -> (i64, i64) { + %ld = affine.load %in[%i, %j] : memref<256x512xi64> + %add = addi %part_sum, %ld : i64 + %mul = muli %part_prod, %ld : i64 + affine.yield %add, %mul : i64, i64 + } + affine.store %sum, %out_sum[%i] : memref<256xi64> + affine.store %prod, %out_prod[%i] : memref<256xi64> + } + return +} + +// CHECK-LABEL: @two_vecdim_reductions +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = constant dense<0> : vector<128xi64> +// CHECK: %[[vone:.*]] = constant dense<1> : vector<128xi64> +// CHECK: %[[vred:.*]]:2 = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[part_sum:.*]] = %[[vzero]], %[[part_prod:.*]] = %[[vone]]) -> (vector<128xi64>, vector<128xi64>) { +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi64>, vector<128xi64> +// CHECK: %[[add:.*]] = addi %[[part_sum]], %[[ld]] : vector<128xi64> +// CHECK: %[[mul:.*]] = muli %[[part_prod]], %[[ld]] : vector<128xi64> +// CHECK: affine.yield %[[add]], %[[mul]] : vector<128xi64>, vector<128xi64> +// CHECK: } +// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]]#0 : vector<128xi64> into i64 +// CHECK: %[[final_prod:.*]] = vector.reduction "mul", %[[vred:.*]]#1 : vector<128xi64> into i64 +// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xi64> +// CHECK: affine.store %[[final_prod]], %{{.*}} : memref<256xi64> +// CHECK: } + +// ----- + +// The outer reduction loop '%j' is vectorized. + +func @vecdim_reduction_nested(%in: memref<256x512xf32>, %out: memref<1xf32>) { + %cst = constant 0.000000e+00 : f32 + %outer_red = affine.for %j = 0 to 512 iter_args(%outer_iter = %cst) -> (f32) { + %inner_red = affine.for %i = 0 to 256 iter_args(%inner_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %add = addf %inner_iter, %ld : f32 + affine.yield %add : f32 + } + %outer_add = addf %outer_iter, %inner_red : f32 + affine.yield %outer_add : f32 + } + affine.store %outer_red, %out[0] : memref<1xf32> + return +} + +// CHECK-LABEL: @vecdim_reduction_nested +// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32> +// CHECK: %[[outer_red:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[outer_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32> +// CHECK: %[[inner_red:.*]] = affine.for %{{.*}} = 0 to 256 iter_args(%[[inner_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[add:.*]] = addf %[[inner_iter]], %[[ld]] : vector<128xf32> +// CHECK: affine.yield %[[add]] : vector<128xf32> +// CHECK: } +// CHECK: %[[outer_add:.*]] = addf %[[outer_iter]], %[[inner_red]] : vector<128xf32> +// CHECK: affine.yield %[[outer_add]] : vector<128xf32> +// CHECK: } +// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[outer_red:.*]] : vector<128xf32> into f32 +// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<1xf32> + +// ----- + +// The inner reduction loop '%j' computes partial sums as a side effect and +// is not vectorized. 
+ +func @vecdim_partial_sums_1_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) { + %cst = constant 1.000000e+00 : f32 + affine.for %i = 0 to 256 { + %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %add = addf %part_sum, %ld : f32 + %mul = mulf %part_prod, %ld : f32 + affine.store %add, %out_partsum[%i, %j] : memref<256x512xf32> + affine.yield %add, %mul : f32, f32 + } + affine.store %sum, %out_sum[%i] : memref<256xf32> + affine.store %prod, %out_prod[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: @vecdim_partial_sums_1_rejected +// CHECK-NOT: vector + +// ----- + +// The inner reduction loop '%j' computes partial sums as a side effect and +// is not vectorized. + +func @vecdim_partial_sums_2_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) { + %cst = constant 1.000000e+00 : f32 + affine.for %i = 0 to 256 { + %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) { + affine.store %part_sum, %out_partsum[%i, %j] : memref<256x512xf32> + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %add = addf %part_sum, %ld : f32 + %mul = mulf %part_prod, %ld : f32 + affine.yield %add, %mul : f32, f32 + } + affine.store %sum, %out_sum[%i] : memref<256xf32> + affine.store %prod, %out_prod[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: @vecdim_partial_sums_2_rejected +// CHECK-NOT: vector + +// ----- + +// The inner reduction loop '%j' performs an unknown reduction operation and is +// not vectorized. + +func @vecdim_unknown_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = constant 1.000000e+00 : f32 + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %add = addf %red_iter, %red_iter : f32 + affine.yield %add : f32 + } + affine.store %final_red, %out[0] : memref<256xf32> + return +} + +// CHECK-LABEL: @vecdim_unknown_reduction_rejected +// CHECK-NOT: vector + +// ----- + +// The inner reduction loop '%j' doesn't perform any operation which is not +// recognized as a standard reduction. + +func @vecdim_none_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = constant 1.000000e+00 : f32 + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + affine.yield %red_iter : f32 + } + affine.store %final_red, %out[0] : memref<256xf32> + return +} + +// CHECK-LABEL: @vecdim_none_reduction_rejected +// CHECK-NOT: vector + +// ----- + +// The number of iterations is not divisable by the vector size, so a mask has +// to be applied to the last update of the accumulator. 
+ +func @vecdim_reduction_masked(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = constant 0.000000e+00 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 500 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %add = addf %red_iter, %ld : f32 + affine.yield %add : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK: #[[$map0:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 500)> +// CHECK-LABEL: @vecdim_reduction_masked +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %[[iv:.*]] = 0 to 500 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[elems_left:.*]] = affine.apply #[[$map0]](%[[iv]]) +// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1> +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32> +// CHECK: affine.yield %[[new_acc]] : vector<128xf32> +// CHECK: } +// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32 +// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32> +// CHECK: } + +// ----- + +// The number of iteration is not known, so a mask has to be applied. + +func @vecdim_reduction_masked_unknown_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %bnd: index) { + %cst = constant 0.000000e+00 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to %bnd iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %add = addf %red_iter, %ld : f32 + affine.yield %add : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK: #[[$map1:.*]] = affine_map<([[d0:.*]]){{\[}}[[s0:.*]]{{\]}} -> (-[[d0]] + [[s0]])> +// CHECK-LABEL: @vecdim_reduction_masked_unknown_ub +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %[[iv:.*]] = 0 to %[[bnd:.*]] step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[elems_left:.*]] = affine.apply #[[$map1]](%[[iv]])[%[[bnd]]] +// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1> +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32> +// CHECK: affine.yield %[[new_acc]] : vector<128xf32> +// CHECK: } +// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32 +// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32> +// CHECK: } + +// ----- + +// The lower bound is nonzero, but the number of iterations is divisible by the +// vector size, so masking is not needed. 
+ +func @vecdim_reduction_nonzero_lb(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = constant 0.000000e+00 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 127 to 511 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %add = addf %red_iter, %ld : f32 + affine.yield %add : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: @vecdim_reduction_nonzero_lb +// CHECK: %{{.*}} = affine.for %{{.*}} = 127 to 511 step 128 iter_args({{.*}}) -> (vector<128xf32>) { +// CHECK-NOT: vector.create_mask + +// ----- + +// The lower bound is unknown, so we need to create a mask. + +func @vecdim_reduction_masked_unknown_lb(%in: memref<256x512xf32>, %out: memref<256xf32>, %lb: index) { + %cst = constant 0.000000e+00 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = %lb to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %add = addf %red_iter, %ld : f32 + affine.yield %add : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK: #[[$map2:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 512)> +// CHECK-LABEL: @vecdim_reduction_masked_unknown_lb +// CHECK: %{{.*}} = affine.for %[[iv:.*]] = %[[lb:.*]] to 512 step 128 iter_args(%[[red_iter:.*]] = {{.*}}) -> (vector<128xf32>) { +// CHECK: %[[elems_left:.*]] = affine.apply #[[$map2]](%[[iv]]) +// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1> +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32> +// CHECK: affine.yield %[[new_acc]] : vector<128xf32> + +// ----- + +// The upper bound is a minimum expression. + +func @vecdim_reduction_complex_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %M: index, %N: index) { + %cst = constant 0.000000e+00 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to min affine_map<(d0, d1) -> (d0, d1*2)>(%M, %N) iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %add = addf %red_iter, %ld : f32 + affine.yield %add : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK: #[[$map3:.*]] = affine_map<([[d0:.*]], [[d1:.*]]) -> ([[d0]], [[d1]] * 2)> +// CHECK: #[[$map3_sub:.*]] = affine_map<([[d0:.*]], [[d1:.*]]) -> ([[d0]] - [[d1]])> +// CHECK-LABEL: @vecdim_reduction_complex_ub +// CHECK: %{{.*}} = affine.for %[[iv:.*]] = 0 to min #[[$map3]](%[[M:.*]], %[[N:.*]]) step 128 iter_args(%[[red_iter:.*]] = {{.*}}) -> (vector<128xf32>) { +// CHECK: %[[ub:.*]] = affine.min #[[$map3]](%[[M]], %[[N]]) +// CHECK: %[[elems_left:.*]] = affine.apply #[[$map3_sub]](%[[ub]], %[[iv]]) +// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1> +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32> +// CHECK: affine.yield %[[new_acc]] : vector<128xf32> + +// ----- + +// The same mask is applied to both reductions. 
+ +func @vecdim_two_reductions_masked(%in: memref<256x512xf32>, %out: memref<512xf32>) { + %cst = constant 0.000000e+00 : f32 + affine.for %i = 0 to 256 { + %final_sum, %final_expsum = affine.for %j = 0 to 500 iter_args(%sum_iter = %cst, %expsum_iter = %cst) -> (f32, f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %exp = math.exp %ld : f32 + %add = addf %sum_iter, %ld : f32 + %eadd = addf %expsum_iter, %exp : f32 + affine.yield %add, %eadd : f32, f32 + } + affine.store %final_sum, %out[2*%i] : memref<512xf32> + affine.store %final_expsum, %out[2*%i + 1] : memref<512xf32> + } + return +} + +// CHECK: #[[$map4:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 500)> +// CHECK-LABEL: @vecdim_two_reductions_masked +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %{{.*}} = affine.for %[[iv:.*]] = 0 to 500 step 128 iter_args(%[[sum_iter:.*]] = {{.*}}, %[[esum_iter:.*]] = {{.*}}) -> (vector<128xf32>, vector<128xf32>) { +// CHECK: %[[elems_left:.*]] = affine.apply #[[$map4]](%[[iv]]) +// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1> +// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[exp:.*]] = math.exp %[[ld]] : vector<128xf32> +// CHECK: %[[add:.*]] = addf %[[sum_iter]], %[[ld]] : vector<128xf32> +// CHECK: %[[eadd:.*]] = addf %[[esum_iter]], %[[exp]] : vector<128xf32> +// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[sum_iter]] : vector<128xi1>, vector<128xf32> +// CHECK: %[[new_eacc:.*]] = select %[[mask]], %[[eadd]], %[[esum_iter]] : vector<128xi1>, vector<128xf32> +// CHECK: affine.yield %[[new_acc]], %[[new_eacc]] : vector<128xf32> +// CHECK: } Index: mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction_2d.mlir =================================================================== --- /dev/null +++ mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction_2d.mlir @@ -0,0 +1,24 @@ +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0 vectorize-reductions=true" -verify-diagnostics + +// TODO: Vectorization of reduction loops along the reduction dimension is not +// supported for higher-rank vectors yet, so we are just checking that an +// error message is produced. + +// expected-error@+1 {{Vectorizing reductions is supported only for 1-D vectors}} +func @vecdim_reduction_2d(%in: memref<256x512x1024xf32>, %out: memref<256xf32>) { + %cst = constant 0.000000e+00 : f32 + affine.for %i = 0 to 256 { + %sum_j = affine.for %j = 0 to 512 iter_args(%red_iter_j = %cst) -> (f32) { + %sum_k = affine.for %k = 0 to 1024 iter_args(%red_iter_k = %cst) -> (f32) { + %ld = affine.load %in[%i, %j, %k] : memref<256x512x1024xf32> + %add = addf %red_iter_k, %ld : f32 + affine.yield %add : f32 + } + %add = addf %red_iter_j, %sum_k : f32 + affine.yield %add : f32 + } + affine.store %sum_j, %out[%i] : memref<256xf32> + } + return +} +
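(Editorial illustration, not part of the patch's test suite.) One case the C++ changes call out but the new tests do not exercise is a reduction loop with a non-unit step: `vectorizeAffineForOp` bails out on such loops, so the nest is left in scalar form. A hedged sketch of that input shape; the function name is hypothetical:

```mlir
// Hypothetical input: the reduction loop below has step 2, so the
// reduction-aware vectorizer currently declines it (see the non-unit-step
// TODO in vectorizeAffineForOp) and the nest stays scalar.
func @vecdim_reduction_nonunit_step(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 step 2 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}
```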