Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -965,6 +965,29 @@
   bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const;
 
+  /// \returns True if the target wants shuffle-based reductions to be emitted
+  /// in scalarized shuffle form instead of vectorized shuffle form.
+  /// E.g. for an fadd reduction of <4 x float> %a:
+  ///
+  /// Scalarized shuffle form:
+  ///  %rdx.shuf = shufflevector <4 x float> %a, <4 x float> undef,
+  ///                            <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  ///  %bin.rdx = fadd fast <4 x float> %a, %rdx.shuf
+  ///  %0 = extractelement <4 x float> %bin.rdx, i32 0
+  ///  %1 = extractelement <4 x float> %bin.rdx, i32 1
+  ///  %res = fadd fast float %0, %1 // scalar operation follows.
+  ///
+  /// Vectorized shuffle form:
+  ///  %rdx.shuf = shufflevector <4 x float> %a, <4 x float> undef,
+  ///                            <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  ///  %bin.rdx = fadd fast <4 x float> %a, %rdx.shuf
+  ///  %rdx.shuf1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,
+  ///                             <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  ///  %bin.rdx2 = fadd fast <4 x float> %bin.rdx, %rdx.shuf1 // vector operation
+  ///                                                         // follows.
+  ///  %res = extractelement <4 x float> %bin.rdx2, i32 0
+  bool useScalarizedShuffleReduction() const;
+
   /// \returns True if the target wants to expand the given reduction intrinsic
   /// into a shuffle sequence.
   bool shouldExpandReduction(const IntrinsicInst *II) const;
@@ -1166,6 +1189,7 @@
                                         VectorType *VecTy) const = 0;
   virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                      ReductionFlags) const = 0;
+  virtual bool useScalarizedShuffleReduction() const = 0;
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
   virtual int getInstructionLatency(const Instruction *I) = 0;
 };
@@ -1561,6 +1585,9 @@
                              ReductionFlags Flags) const override {
     return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
   }
+  bool useScalarizedShuffleReduction() const override {
+    return Impl.useScalarizedShuffleReduction(); // Delegate to wrapped Impl.
+  }
   bool shouldExpandReduction(const IntrinsicInst *II) const override {
     return Impl.shouldExpandReduction(II);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -564,6 +564,10 @@
     return false;
   }
 
+  bool useScalarizedShuffleReduction() const {
+    return false; // Default: do not request the scalarized shuffle form.
+  }
+
   bool shouldExpandReduction(const IntrinsicInst *II) const {
     return true;
   }
Index: include/llvm/Transforms/Utils/LoopUtils.h
===================================================================
--- include/llvm/Transforms/Utils/LoopUtils.h
+++ include/llvm/Transforms/Utils/LoopUtils.h
@@ -517,10 +517,14 @@
                     ArrayRef<Value *> RedOps = None);
 
 /// Generates a vector reduction using shufflevectors to reduce the value.
+/// If \p ScalarizedShufRed is set, the last reduction step is emitted as a
+/// scalar operation on lanes 0 and 1 of the partially reduced vector instead
+/// of as one more shuffle followed by a vector operation.
 Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
                            RecurrenceDescriptor::MinMaxRecurrenceKind
                                MinMaxKind = RecurrenceDescriptor::MRK_Invalid,
-                           ArrayRef<Value *> RedOps = None);
+                           ArrayRef<Value *> RedOps = None,
+                           bool ScalarizedShufRed = false);
 
 /// Create a target reduction of the given vector. The reduction operation
 /// is described by the \p Opcode parameter. min/max reductions require
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -622,6 +622,10 @@
   return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
 }
 
+bool TargetTransformInfo::useScalarizedShuffleReduction() const {
+  return TTIImpl->useScalarizedShuffleReduction(); // Forward to TTIImpl.
+}
+
 bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
   return TTIImpl->shouldExpandReduction(II);
 }
Index: lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- lib/Transforms/Utils/LoopUtils.cpp
+++ lib/Transforms/Utils/LoopUtils.cpp
@@ -1558,11 +1558,33 @@
   return Result;
 }
 
+// Emits one reduction step on Lhs and Rhs: a binary operator, or a min/max
+// operation for ICmp/FCmp. RedOps, if non-empty, go to propagateIRFlags.
+static Value *
+createReductionOp(IRBuilder<> &Builder, Value *Lhs, Value *Rhs, unsigned Op,
+                  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+                  ArrayRef<Value *> RedOps) {
+  Value *Res;
+  if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+    // Floating point operations had to be 'fast' to enable the reduction.
+    Res = addFastMathFlag(
+        Builder.CreateBinOp((Instruction::BinaryOps)Op, Lhs, Rhs, "bin.rdx"));
+  } else {
+    assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+           "Invalid min/max");
+    Res = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, Lhs, Rhs);
+  }
+  if (!RedOps.empty())
+    propagateIRFlags(Res, RedOps);
+  return Res;
+}
+
 // Helper to generate a log2 shuffle reduction.
 Value *
 llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
                           RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
-                          ArrayRef<Value *> RedOps) {
+                          ArrayRef<Value *> RedOps,
+                          bool ScalarizedShufRed) {
   unsigned VF = Src->getType()->getVectorNumElements();
   // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
   // and vector ops, reducing the set of values being computed by half each
@@ -1571,7 +1593,9 @@
          "Reduction emission only supported for pow2 vectors!");
   Value *TmpVec = Src;
   SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
-  for (unsigned i = VF; i != 1; i >>= 1) {
+  ScalarizedShufRed = ScalarizedShufRed && VF > 1; // <1 x T> has no lane 1.
+  unsigned UB = ScalarizedShufRed ? 2 : 1;
+  for (unsigned i = VF; i != UB; i >>= 1) {
     // Move the upper half of the vector to the lower half.
     for (unsigned j = 0; j != i / 2; ++j)
       ShuffleMask[j] = Builder.getInt32(i / 2 + j);
@@ -1583,22 +1607,20 @@
     Value *Shuf = Builder.CreateShuffleVector(
         TmpVec, UndefValue::get(TmpVec->getType()),
         ConstantVector::get(ShuffleMask), "rdx.shuf");
-
-    if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
-      // Floating point operations had to be 'fast' to enable the reduction.
-      TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
-                                                   TmpVec, Shuf, "bin.rdx"));
-    } else {
-      assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
-             "Invalid min/max");
-      TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec,
-                                                    Shuf);
-    }
-    if (!RedOps.empty())
-      propagateIRFlags(TmpVec, RedOps);
+    TmpVec = createReductionOp(Builder, TmpVec, Shuf, Op, MinMaxKind, RedOps);
   }
-  // The result is in the first element of the vector.
-  return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+
+  if (!ScalarizedShufRed)
+    // The result is in the first element of the vector.
+    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+
+  // The result comes from performing the scalar operation on the first two
+  // elements of the vector.
+  return createReductionOp(
+    Builder,
+    Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)),
+    Builder.CreateExtractElement(TmpVec, Builder.getInt32(1)),
+    Op, MinMaxKind, RedOps);
 }
 
 /// Create a simple vector reduction specified by an opcode and some
@@ -1675,7 +1697,9 @@
   }
   if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
     return BuildFunc();
-  return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
+
+  return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps,
+                             TTI->useScalarizedShuffleReduction());
 }
 
 /// Create a vector reduction using a given recurrence descriptor.