Index: lib/Transforms/InstCombine/InstCombineAddSub.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1347,6 +1347,9 @@ I.setHasNoUnsignedWrap(true); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -1452,6 +1455,9 @@ return replaceInstUsesWith(I, V); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -1723,6 +1729,9 @@ I.setHasNoUnsignedWrap(true); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -1777,5 +1786,8 @@ return replaceInstUsesWith(I, V); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } Index: lib/Transforms/InstCombine/InstCombineAndOrXor.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1427,6 +1427,9 @@ if (Instruction *Select = foldBoolSextMaskToSelect(I)) return Select; + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -2342,6 +2345,9 @@ } } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -2637,5 +2643,8 @@ if (Instruction *CastedXor = foldCastedBitwiseLogic(I)) return CastedXor; + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? 
&I : nullptr; } Index: lib/Transforms/InstCombine/InstCombineCompares.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCompares.cpp +++ lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -4528,6 +4528,10 @@ if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X) return foldICmpAddOpConst(I, X, Cst, I.getSwappedPredicate()); } + + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -4948,5 +4952,8 @@ return new FCmpInst(I.getPredicate(), LHSExt->getOperand(0), RHSExt->getOperand(0)); + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } Index: lib/Transforms/InstCombine/InstCombineInternal.h =================================================================== --- lib/Transforms/InstCombine/InstCombineInternal.h +++ lib/Transforms/InstCombine/InstCombineInternal.h @@ -565,6 +565,11 @@ Value *SimplifyVectorOp(BinaryOperator &Inst); Value *SimplifyBSwap(BinaryOperator &Inst); + /// Try to combine instructions with all ExtractElement operands only + /// that extract from vector operands of the same type, the same vector + /// size at the same index, into a vector form with single resulting + /// ExtractElement instruction only. + Value *WidenScalarOp(Instruction &Inst); /// Given a binary operator, cast instruction, or select which has a PHI node /// as operand #0, see if we can fold the instruction into the PHI (which is Index: lib/Transforms/InstCombine/InstCombineMulDivRem.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -463,6 +463,9 @@ I.setHasNoUnsignedWrap(true); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? 
&I : nullptr; } @@ -778,6 +781,9 @@ break; } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return Changed ? &I : nullptr; } @@ -1176,6 +1182,9 @@ return Inst; } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -1260,6 +1269,9 @@ } } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -1430,6 +1442,9 @@ return &I; } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -1518,6 +1533,9 @@ return SelectInst::Create(Cmp, Op0, Sub); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -1593,6 +1611,9 @@ } } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -1610,5 +1631,8 @@ if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I)) return &I; + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } Index: lib/Transforms/InstCombine/InstCombinePHI.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombinePHI.cpp +++ lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -1014,5 +1014,8 @@ if (Instruction *Res = SliceUpIllegalIntegerPHI(PN)) return Res; + if (Value *V = WidenScalarOp(PN)) + return replaceInstUsesWith(PN, V); + return nullptr; } Index: lib/Transforms/InstCombine/InstCombineShifts.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineShifts.cpp +++ lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -610,6 +610,9 @@ return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1)); } + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -694,6 +697,10 @@ return &I; } } + + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } @@ -763,5 +770,8 @@ if (MaskedValueIsZero(Op0, APInt::getSignBit(BitWidth), 0, &I)) return 
BinaryOperator::CreateLShr(Op0, Op1); + if (Value *V = WidenScalarOp(I)) + return replaceInstUsesWith(I, V); + return nullptr; } Index: lib/Transforms/InstCombine/InstructionCombining.cpp =================================================================== --- lib/Transforms/InstCombine/InstructionCombining.cpp +++ lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1397,6 +1397,66 @@ return nullptr; } +Value *InstCombiner::WidenScalarOp(Instruction &Inst) { + if (Inst.getType()->isVectorTy() || + !VectorType::isValidElementType(Inst.getType()) || + Inst.mayHaveSideEffects()) + return nullptr; + auto *EI = dyn_cast<ExtractElementInst>(*Inst.op_begin()); + if (!EI) + return nullptr; + unsigned NE = EI->getVectorOperandType()->getNumElements(); + Value *EIdx = EI->getIndexOperand(); + // Check that all operands of the user instruction are extractelement + // from the vectors of the same size and from the same lanes and the vector + // operand is not an insertelement instruction (this sequence is handled + // differently). + if (!std::all_of(Inst.op_begin(), Inst.op_end(), [NE, EIdx](const Value *V) { + auto *EEI = dyn_cast<ExtractElementInst>(V); + return EEI && !isa<InsertElementInst>(EEI->getVectorOperand()) && + EEI->getVectorOperandType()->getNumElements() == NE && + EEI->getIndexOperand() == EIdx; + })) + return nullptr; + int NumVectorizedExtracts = 0; + SmallSet<const ExtractElementInst *, 8> CountedOperands; + for (auto *Op : Inst.operand_values()) { + auto *EEOp = cast<ExtractElementInst>(Op); + const Instruction *UserLast = EEOp->user_back(); + // If the only user of the extractelement instruction is the + // to-be-vectorized user instruction, count this instruction as the + // one to be removed. + if (EEOp->hasOneUse() || + (std::all_of(EEOp->user_begin(), EEOp->user_end(), + [UserLast](User *U) { return U == UserLast; }) && + CountedOperands.insert(EEOp).second)) + ++NumVectorizedExtracts; + } + // If the number of extractelement instructions to be removed does not exceed + // 1, do not widen this instruction sequence. 
+ if (NumVectorizedExtracts <= 1) + return nullptr; + // Generate vector code instead of the scalar one. + Instruction *NewI = Inst.clone(); + NewI->setName("widen.vect"); + NewI->mutateType(VectorType::get(Inst.getType(), NE)); + for (unsigned Idx = 0, EIdx = NewI->getNumOperands(); Idx < EIdx; ++Idx) { + auto *EE = cast<ExtractElementInst>(NewI->getOperand(Idx)); + NewI->setOperand(Idx, EE->getVectorOperand()); + } + for (auto *V : Inst.operand_values()) { + // Remove extractelement instructions. + if (auto *I = dyn_cast<ExtractElementInst>(V)) + replaceInstUsesWith(*I, UndefValue::get(V->getType())); + } + InsertNewInstWith(NewI, Inst); + // %widen.extract = extractelement %widen.vect, i32 Idx + // Replace uses of the scalar instruction by the %widen.extract + // instruction. + return InsertNewInstWith( + ExtractElementInst::Create(NewI, EIdx, "widen.extract"), Inst); +} + Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end()); Index: test/Transforms/InstCombine/bitcast-bigendian.ll =================================================================== --- test/Transforms/InstCombine/bitcast-bigendian.ll +++ test/Transforms/InstCombine/bitcast-bigendian.ll @@ -9,11 +9,10 @@ define float @test2(<2 x float> %A, <2 x i32> %B) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[A:%.*]], i32 1 ; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i32> [[B:%.*]] to <2 x float> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[BC]], i32 1 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP24]], [[TMP4]] -; CHECK-NEXT: ret float [[ADD]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fadd <2 x float> [[BC]], [[A:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <2 x float> [[WIDEN_VECT]], i32 1 +; CHECK-NEXT: ret float [[WIDEN_EXTRACT]] ; %tmp28 = bitcast <2 x float> %A to i64 %tmp23 = trunc i64 %tmp28 to i32 Index: test/Transforms/InstCombine/bitcast.ll =================================================================== --- 
test/Transforms/InstCombine/bitcast.ll +++ test/Transforms/InstCombine/bitcast.ll @@ -244,11 +244,10 @@ ; rdar://7892780 define float @test2(<2 x float> %A, <2 x i32> %B) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i32> [[B:%.*]] to <2 x float> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[BC]], i32 0 -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP24]], [[TMP4]] -; CHECK-NEXT: ret float [[ADD]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fadd <2 x float> [[BC]], [[A:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <2 x float> [[WIDEN_VECT]], i32 0 +; CHECK-NEXT: ret float [[WIDEN_EXTRACT]] ; %tmp28 = bitcast <2 x float> %A to i64 ; [#uses=2] %tmp23 = trunc i64 %tmp28 to i32 ; [#uses=1] Index: test/Transforms/InstCombine/type_pun.ll =================================================================== --- test/Transforms/InstCombine/type_pun.ll +++ test/Transforms/InstCombine/type_pun.ll @@ -118,14 +118,12 @@ ; CHECK-NEXT: [[SROA_BC:%.*]] = bitcast <16 x i8> [[IN:%.*]] to <4 x i32> ; CHECK-NEXT: br i1 undef, label [[LEFT:%.*]], label [[RIGHT:%.*]] ; CHECK: left: -; CHECK-NEXT: [[SROA_EXTRACT1:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0 ; CHECK-NEXT: br label [[TAIL:%.*]] ; CHECK: right: -; CHECK-NEXT: [[SROA_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0 ; CHECK-NEXT: br label [[TAIL]] ; CHECK: tail: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[SROA_EXTRACT1]], [[LEFT]] ], [ [[SROA_EXTRACT]], [[RIGHT]] ] -; CHECK-NEXT: ret i32 [[I]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <4 x i32> [[SROA_BC]], i32 0 +; CHECK-NEXT: ret i32 [[WIDEN_EXTRACT]] ; entry: %sroa = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> Index: test/Transforms/InstCombine/x86-avx512.ll =================================================================== --- test/Transforms/InstCombine/x86-avx512.ll +++ test/Transforms/InstCombine/x86-avx512.ll @@ -6,11 +6,10 
@@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_add_ss( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> , <4 x i32> +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fadd <4 x float> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[WIDEN_VECT]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -33,15 +32,14 @@ define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_add_ss_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <4 x float> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> 
[[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], float [[WIDEN_EXTRACT]], float [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[A]], float [[TMP4]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -83,11 +81,9 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @test_add_sd( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP4]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fadd <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[WIDEN_VECT]], <2 x i32> +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4) @@ -106,15 +102,14 @@ define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_add_sd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0 -; CHECK-NEXT: ret <2 x 
double> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fadd <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <2 x double> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[WIDEN_EXTRACT]], double [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A]], double [[TMP4]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP5]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4) @@ -148,11 +143,10 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_sub_ss( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> , <4 x i32> +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fsub <4 x float> [[A:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[WIDEN_VECT]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -175,15 +169,14 @@ define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_sub_ss_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 -; 
CHECK-NEXT: [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <4 x float> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], float [[WIDEN_EXTRACT]], float [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[A]], float [[TMP4]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -225,11 +218,9 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @test_sub_sd( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fsub double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP4]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[WIDEN_VECT]], <2 x i32> +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x 
double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4) @@ -248,15 +239,14 @@ define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_sub_sd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fsub double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <2 x double> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[WIDEN_EXTRACT]], double [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A]], double [[TMP4]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP5]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4) @@ -290,11 +280,10 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_mul_ss( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = 
insertelement <4 x float> [[A]], float [[TMP3]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> , <4 x i32> +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fmul <4 x float> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[WIDEN_VECT]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -317,15 +306,14 @@ define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_mul_ss_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fmul <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <4 x float> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], float [[WIDEN_EXTRACT]], float [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[A]], float [[TMP4]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 
2.000000e+00, i32 2 @@ -367,11 +355,9 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @test_mul_sd( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP4]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fmul <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[WIDEN_VECT]], <2 x i32> +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4) @@ -390,15 +376,14 @@ define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_mul_sd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fmul <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <2 x double> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: 
[[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[WIDEN_EXTRACT]], double [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A]], double [[TMP4]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP5]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4) @@ -432,11 +417,10 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @test_div_ss( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> , <4 x i32> +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fdiv <4 x float> [[A:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[WIDEN_VECT]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -459,15 +443,14 @@ define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: @test_div_ss_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float 
[[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0 -; CHECK-NEXT: ret <4 x float> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fdiv <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <4 x float> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], float [[WIDEN_EXTRACT]], float [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[A]], float [[TMP4]], i64 0 +; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2 @@ -509,11 +492,9 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @test_div_sd( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fdiv double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP4]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[WIDEN_VECT]], <2 x i32> +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4) @@ -532,15 +513,14 @@ define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: @test_div_sd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x 
double> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = fdiv double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0 -; CHECK-NEXT: ret <2 x double> [[TMP8]] +; CHECK-NEXT: [[WIDEN_VECT:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[WIDEN_EXTRACT:%.*]] = extractelement <2 x double> [[WIDEN_VECT]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[WIDEN_EXTRACT]], double [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A]], double [[TMP4]], i64 0 +; CHECK-NEXT: ret <2 x double> [[TMP5]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 %2 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)