Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -51,7 +51,7 @@
 /// and if one of the extracts should be transformed to a shufflevector, set
 /// \p ConvertToShuffle to that extract instruction.
 static bool isExtractExtractCheap(Instruction *Ext0, Instruction *Ext1,
-                                  unsigned Opcode,
+                                  const Instruction &OpInst,
                                   const TargetTransformInfo &TTI,
                                   Instruction *&ConvertToShuffle,
                                   unsigned PreferredExtractIndex) {
@@ -63,6 +63,7 @@
   int ScalarOpCost, VectorOpCost;
 
   // Get cost estimates for scalar and vector versions of the operation.
+  unsigned Opcode = OpInst.getOpcode();
   bool IsBinOp = Instruction::isBinaryOp(Opcode);
   if (IsBinOp) {
     ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
@@ -80,6 +81,8 @@
   // both sequences.
   unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
   unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
+  Value *Ext0Vector = Ext0->getOperand(0);
+  Value *Ext1Vector = Ext1->getOperand(0);
 
   int Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
                                             VecTy, Ext0Index);
@@ -98,7 +101,7 @@
   // Extra uses of the extracts mean that we include those costs in the
   // vector total because those instructions will not be eliminated.
   int OldCost, NewCost;
-  if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
+  if (Ext0Vector == Ext1Vector && Ext0Index == Ext1Index) {
     // Handle a special case. If the 2 extracts are identical, adjust the
     // formulas to account for that. The extra use charge allows for either the
     // CSE'd pattern or an unoptimized form with identical values:
@@ -119,35 +122,52 @@
   if (Ext0Index == Ext1Index) {
     // If the extract indexes are identical, no shuffle is needed.
     ConvertToShuffle = nullptr;
-  } else {
-    if (IsBinOp && DisableBinopExtractShuffle)
-      return true;
-
-    // If we are extracting from 2 different indexes, then one operand must be
-    // shuffled before performing the vector operation. The shuffle mask is
-    // undefined except for 1 lane that is being translated to the remaining
-    // extraction lane. Therefore, it is a splat shuffle. Ex:
-    // ShufMask = { undef, undef, 0, undef }
-    // TODO: The cost model has an option for a "broadcast" shuffle
-    // (splat-from-element-0), but no option for a more general splat.
-    NewCost +=
-        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
-
-    // The more expensive extract will be replaced by a shuffle. If the costs
-    // are equal and there is a preferred extract index, shuffle the opposite
-    // operand. Otherwise, replace the extract with the higher index.
-    if (Extract0Cost > Extract1Cost)
-      ConvertToShuffle = Ext0;
-    else if (Extract1Cost > Extract0Cost)
-      ConvertToShuffle = Ext1;
-    else if (PreferredExtractIndex == Ext0Index)
-      ConvertToShuffle = Ext1;
-    else if (PreferredExtractIndex == Ext1Index)
-      ConvertToShuffle = Ext0;
-    else
-      ConvertToShuffle = Ext0Index > Ext1Index ? Ext0 : Ext1;
+    return OldCost < NewCost;
   }
 
+  if (Ext0Vector == Ext1Vector && OpInst.hasOneUse()) {
+    // If this op matches a reduction pattern, leave the extracts in place.
+    const auto *ReductionInst = OpInst.user_back();
+    // The single user performs the same operation.
+    if (ReductionInst->getOpcode() == Opcode) {
+      assert(ReductionInst->getNumOperands() == 2 &&
+             "Expected a binary operation or compare");
+      Value *SecondOp = ReductionInst->getOperand(0) == &OpInst
+                            ? ReductionInst->getOperand(1)
+                            : ReductionInst->getOperand(0);
+      auto *Ext2 = dyn_cast<ExtractElementInst>(SecondOp);
+      if (Ext2 && Ext2->getOperand(0) == Ext0Vector)
+        return true;
+    }
+  }
+
+  if (IsBinOp && DisableBinopExtractShuffle)
+    return true;
+
+  // If we are extracting from 2 different indexes, then one operand must be
+  // shuffled before performing the vector operation. The shuffle mask is
+  // undefined except for 1 lane that is being translated to the remaining
+  // extraction lane. Therefore, it is a splat shuffle. Ex:
+  // ShufMask = { undef, undef, 0, undef }
+  // TODO: The cost model has an option for a "broadcast" shuffle
+  // (splat-from-element-0), but no option for a more general splat.
+  NewCost +=
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+
+  // The more expensive extract will be replaced by a shuffle. If the costs
+  // are equal and there is a preferred extract index, shuffle the opposite
+  // operand. Otherwise, replace the extract with the higher index.
+  if (Extract0Cost > Extract1Cost)
+    ConvertToShuffle = Ext0;
+  else if (Extract1Cost > Extract0Cost)
+    ConvertToShuffle = Ext1;
+  else if (PreferredExtractIndex == Ext0Index)
+    ConvertToShuffle = Ext1;
+  else if (PreferredExtractIndex == Ext1Index)
+    ConvertToShuffle = Ext0;
+  else
+    ConvertToShuffle = Ext0Index > Ext1Index ? Ext0 : Ext1;
+
   // Aggressively form a vector op if the cost is equal because the transform
   // may enable further optimization.
   // Codegen can reverse this transform (scalarize) if it was not profitable.
@@ -227,8 +247,7 @@
                                  m_ConstantInt(InsertIndex)));
 
   Instruction *ConvertToShuffle;
-  if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), TTI, ConvertToShuffle,
-                            InsertIndex))
+  if (isExtractExtractCheap(Ext0, Ext1, I, TTI, ConvertToShuffle, InsertIndex))
     return false;
 
   if (ConvertToShuffle) {
Index: llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefixes=PHASE
+; RUN: opt -passes='default<O3>' -S < %s | FileCheck %s --check-prefixes=PHASE
 
 declare void @use_i8(i8)
 declare void @use_f32(float)
@@ -486,3 +488,92 @@
   %v3 = insertelement <4 x float> %v2, float %b23, i32 3
   ret <4 x float> %v3
 }
+
+; SIMD reduction operation.
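+; All extracts come from the same 'and' and feed a chain of identical scalar
+; ops (a horizontal reduction), so the new check in isExtractExtractCheap()
+; leaves the chain alone for later reduction matching.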
+define i32 @ext_ext_reduction(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @ext_ext_reduction(
+; CHECK-NEXT:    [[AND:%.*]] = and <4 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[AND]], i32 0
+; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x i32> [[AND]], i32 1
+; CHECK-NEXT:    [[ADD_1:%.*]] = or i32 [[VECEXT_1]], [[VECEXT]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x i32> [[AND]], i32 2
+; CHECK-NEXT:    [[ADD_2:%.*]] = or i32 [[VECEXT_2]], [[ADD_1]]
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x i32> [[AND]], i32 3
+; CHECK-NEXT:    [[ADD_3:%.*]] = or i32 [[VECEXT_3]], [[ADD_2]]
+; CHECK-NEXT:    ret i32 [[ADD_3]]
+;
+; PHASE-LABEL: @ext_ext_reduction(
+; PHASE-NEXT:    [[AND:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
+; PHASE-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[AND]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; PHASE-NEXT:    [[BIN_RDX:%.*]] = or <4 x i32> [[AND]], [[RDX_SHUF]]
+; PHASE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; PHASE-NEXT:    [[BIN_RDX2:%.*]] = or <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; PHASE-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; PHASE-NEXT:    ret i32 [[TMP1]]
+;
+  %and = and <4 x i32> %x, %y
+  %vecext = extractelement <4 x i32> %and, i32 0
+  %vecext.1 = extractelement <4 x i32> %and, i32 1
+  %add.1 = or i32 %vecext.1, %vecext
+  %vecext.2 = extractelement <4 x i32> %and, i32 2
+  %add.2 = or i32 %vecext.2, %add.1
+  %vecext.3 = extractelement <4 x i32> %and, i32 3
+  %add.3 = or i32 %vecext.3, %add.2
+  ret i32 %add.3
+}
+
+define i32 @ext_ext_reduction_1(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @ext_ext_reduction_1(
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x i32> [[Y]], i32 1
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[VECEXT_1]], [[VECEXT]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x i32> [[Y]], i32 2
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[VECEXT_2]], [[ADD_1]]
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
+; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[VECEXT_3]], [[ADD_2]]
+; CHECK-NEXT:    ret i32 [[ADD_3]]
+;
+; PHASE-LABEL: @ext_ext_reduction_1(
+; PHASE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 1
+; PHASE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x i32> [[Y]], i32 2
+; PHASE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; PHASE-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[Y]]
+; PHASE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; PHASE-NEXT:    [[ADD_2:%.*]] = add i32 [[TMP3]], [[VECEXT_1]]
+; PHASE-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_2]], [[VECEXT_2]]
+; PHASE-NEXT:    ret i32 [[ADD_3]]
+;
+  %vecext = extractelement <4 x i32> %y, i32 0
+  %vecext.1 = extractelement <4 x i32> %y, i32 1
+  %add.1 = add i32 %vecext.1, %vecext
+  %vecext.2 = extractelement <4 x i32> %y, i32 2
+  %add.2 = add i32 %vecext.2, %add.1
+  %vecext.3 = extractelement <4 x i32> %x, i32 2
+  %add.3 = add i32 %vecext.3, %add.2
+  ret i32 %add.3
+}
+
+define i32 @ext_ext_reduction_2(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @ext_ext_reduction_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[Y]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[VECEXT_2]], [[TMP3]]
+; CHECK-NEXT:    ret i32 [[ADD_2]]
+;
+; PHASE-LABEL: @ext_ext_reduction_2(
+; PHASE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 1
+; PHASE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; PHASE-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[Y]]
+; PHASE-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; PHASE-NEXT:    [[ADD_2:%.*]] = add i32 [[TMP3]], [[VECEXT_1]]
+; PHASE-NEXT:    ret i32 [[ADD_2]]
+;
+  %vecext = extractelement <4 x i32> %y, i32 0
+  %vecext.1 = extractelement <4 x i32> %y, i32 1
+  %add.1 = add i32 %vecext.1, %vecext
+  %vecext.2 = extractelement <4 x i32> %x, i32 2
+  %add.2 = add i32 %vecext.2, %add.1
+  ret i32 %add.2
+}
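
Note for reviewers: the new early-out in isExtractExtractCheap() is the core
of this change. Lifted into a standalone helper, it reads as below. This is
only an illustrative sketch, not part of the patch: the helper name
feedsMatchingReduction is invented here, the patch inlines this logic (with
an assert instead of the arity check), and the sketch assumes the usual LLVM
headers are available.

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Returns true when OpInst's only user continues a same-opcode reduction
  // chain with another extract from the same source vector. In that case the
  // extract/extract transform is skipped so later passes can match the whole
  // horizontal reduction instead of forming a partial vector op.
  static bool feedsMatchingReduction(const Instruction &OpInst, Value *SrcVec) {
    if (!OpInst.hasOneUse())
      return false;
    const Instruction *ReductionInst = OpInst.user_back();
    if (ReductionInst->getOpcode() != OpInst.getOpcode() ||
        ReductionInst->getNumOperands() != 2)
      return false;
    // The user's operand that is not OpInst must be another extract from the
    // same source vector.
    Value *SecondOp = ReductionInst->getOperand(0) == &OpInst
                          ? ReductionInst->getOperand(1)
                          : ReductionInst->getOperand(0);
    auto *Ext2 = dyn_cast<ExtractElementInst>(SecondOp);
    return Ext2 && Ext2->getOperand(0) == SrcVec;
  }

The ext_ext_reduction test exercises exactly this shape: each scalar 'or'
feeds another 'or' whose other operand extracts from the same 'and' result,
so VectorCombine now leaves the chain intact and the -O3 pipeline forms the
log2 shuffle reduction shown in the PHASE lines.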