Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -406,6 +406,10 @@
   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
   unsigned canMapToVector(Type *T, const DataLayout &DL) const;
 
+  /// Try to convert instructions with extractelement operands into a vector
+  /// form with a single extractelement instruction.
+  bool tryToWidenExtractElementInsts(ArrayRef<Value *> ExtractInsts);
+
   /// \returns True if the VectorizableTree is both tiny and not fully
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable();
@@ -413,6 +417,11 @@
 private:
   struct TreeEntry;
 
+  /// Calculates the cost of transforming the \p VL instructions from scalar
+  /// to vector form.
+  Optional<int> getCost(unsigned Opcode, ArrayRef<Value *> VL, Type *ScalarTy,
+                        Type *VecTy) const;
+
   /// \returns the cost of the vectorizable entry.
   int getEntryCost(TreeEntry *E);
@@ -1584,6 +1593,108 @@
   return N;
 }
 
+bool BoUpSLP::tryToWidenExtractElementInsts(ArrayRef<Value *> ExtractInsts) {
+  bool Changed = false;
+  // Store the extractelement instruction + its tree height.
+  SmallVector<std::pair<Value *, unsigned>, 4> Insts;
+  Insts.reserve(ExtractInsts.size());
+  for (auto &V : ExtractInsts)
+    Insts.emplace_back(V, 0);
+  for (unsigned Idx = 0, E = Insts.size(); Idx < E; ++Idx) {
+    auto *EE = dyn_cast<ExtractElementInst>(Insts[Idx].first);
+    // Skip analysis of already deleted extractelements or instruction trees
+    // with height >= RecursionMaxDepth.
+    if (!EE || Insts[Idx].second == RecursionMaxDepth)
+      continue;
+    unsigned NE = EE->getVectorOperandType()->getNumElements();
+    auto *EIdx = EE->getIndexOperand();
+    for (auto *U : EE->users()) {
+      auto *I = dyn_cast<Instruction>(U);
+      // Check if the user instruction is vectorizable.
+      if (!I || !isValidElementType(I->getType()) || I->mayHaveSideEffects() ||
+          EphValues.count(I) > 0)
+        continue;
+      DEBUG(dbgs() << "SLP: trying to widen instruction " << *I << "\n");
+      Optional<int> Cost = getCost(I->getOpcode(), I, I->getType(),
+                                   VectorType::get(I->getType(), NE));
+      if (!Cost)
+        continue;
+      // Check that all operands of the user instruction are extractelements
+      // from vectors of the same size and from the same lane.
+      if (!std::all_of(I->op_begin(), I->op_end(), [NE, EIdx](const Value *V) {
+            auto *EEI = dyn_cast<ExtractElementInst>(V);
+            return EEI &&
+                   EEI->getVectorOperandType()->getNumElements() == NE &&
+                   EEI->getIndexOperand() == EIdx;
+          }))
+        continue;
+      int EIdxVal = -1;
+      if (auto *EIdxC = dyn_cast<ConstantInt>(EIdx))
+        if (EIdxC->getValue().isNonNegative())
+          EIdxVal = EIdxC->getZExtValue();
+      // Estimate the scalar cost of the instructions to be transformed into
+      // vector form.
+      int ScalarCost = 0;
+      DenseSet<ExtractElementInst *> EEWithCost;
+      EEWithCost.reserve(I->getNumOperands());
+      for (auto *Op : I->operand_values()) {
+        auto *EEOp = cast<ExtractElementInst>(Op);
+        const Instruction *UserLast = EEOp->user_back();
+        // If the only user of the extractelement instruction is the
+        // to-be-vectorized user instruction, include the cost of this
+        // extractelement in the scalar cost (it can safely be removed during
+        // vectorization).
+        // EEWithCost is used to count the cost of the extractelement
+        // instruction only once.
+        if (EEWithCost.insert(EEOp).second &&
+            (EEOp->hasOneUse() ||
+             std::all_of(EEOp->user_begin(), EEOp->user_end(),
+                         [UserLast](User *U) { return U == UserLast; }))) {
+          ScalarCost +=
+              TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                      EEOp->getVectorOperandType(), EIdxVal);
+        }
+      }
+      // Get the vector cost of the new vectorized code: vectorized user
+      // instruction + extractelement <NE x Ty> %widen.vect, i32 EIdx.
+      int VecCost =
+          TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                  VectorType::get(I->getType(), NE), EIdxVal);
+      int ResCost = Cost.getValue() + VecCost - ScalarCost;
+      if (ResCost >= -SLPCostThreshold)
+        continue;
+      DEBUG(dbgs() << "SLP: Decided to widen cost=" << ResCost << "\n");
+      // Generate vector code instead of the scalar one.
+      Builder.SetInsertPoint(I->getParent(), ++I->getIterator());
+      Builder.SetCurrentDebugLocation(I->getDebugLoc());
+      // Create a vectorized version of the user instruction.
+      Instruction *NewI = I->clone();
+      NewI->mutateType(VectorType::get(I->getType(), NE));
+      for (unsigned OpIdx = 0, NumOps = NewI->getNumOperands(); OpIdx < NumOps;
+           ++OpIdx) {
+        auto *OpEE = cast<ExtractElementInst>(NewI->getOperand(OpIdx));
+        NewI->setOperand(OpIdx, OpEE->getVectorOperand());
+        // Remove the extractelement instruction only if this is its last use.
+        if (OpEE->hasOneUse()) {
+          OpEE->replaceAllUsesWith(UndefValue::get(OpEE->getType()));
+          eraseInstruction(OpEE);
+        }
+      }
+      Builder.Insert(NewI, "widen.vect");
+      // %widen.extract = extractelement <NE x Ty> %widen.vect, i32 EIdx
+      Value *NewEE = Builder.CreateExtractElement(NewI, EIdx, "widen.extract");
+      // Replace uses of the scalar instruction with the %widen.extract
+      // instruction.
+      I->replaceAllUsesWith(NewEE);
+      eraseInstruction(I);
+      // Add %widen.extract to the list of extractelement instructions for
+      // future analysis of a possibly vectorizable tree.
+      Insts.emplace_back(NewEE, Insts[Idx].second + 1);
+      E = Insts.size();
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
   assert(Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue);
@@ -1627,8 +1738,175 @@
   return true;
 }
 
+Optional<int> BoUpSLP::getCost(unsigned Opcode, ArrayRef<Value *> VL,
+                               Type *ScalarTy, Type *VecTy) const {
+  assert(ScalarTy && VecTy &&
+         "both ScalarTy/VectorTy parameters must be specified.");
+  assert(Opcode && "Expected non-null opcode.");
+  auto *VL0 = cast<Instruction>(VL[0]);
+  int VecCost;
+  int ScalarCost;
+  switch (Opcode) {
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    Type *SrcTy = VL0->getOperand(0)->getType();
+    VecCost = TTI->getCastInstrCost(
+        Opcode, VecTy, VectorType::get(SrcTy, VecTy->getVectorNumElements()));
+
+    // Calculate the cost of this instruction.
+    ScalarCost = VL.size() * TTI->getCastInstrCost(Opcode, ScalarTy, SrcTy);
+    break;
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+  case Instruction::Select: {
+    // Calculate the cost of this instruction.
+    VecCost = TTI->getCmpSelInstrCost(
+        Opcode, VecTy,
+        VectorType::get(Type::getInt1Ty(VL0->getContext()),
+                        VecTy->getVectorNumElements()));
+    ScalarCost =
+        VL.size() *
+        TTI->getCmpSelInstrCost(Opcode, ScalarTy,
+                                Type::getInt1Ty(VL0->getContext()));
+    break;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    // Certain instructions can be cheaper to vectorize if they have a
+    // constant second vector operand.
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_UniformConstantValue;
+    TargetTransformInfo::OperandValueProperties Op1VP =
+        TargetTransformInfo::OP_None;
+    TargetTransformInfo::OperandValueProperties Op2VP =
+        TargetTransformInfo::OP_None;
+
+    // If all operands are exactly the same ConstantInt then set the
+    // operand kind to OK_UniformConstantValue.
+    // If instead not all operands are constants, then set the operand kind
+    // to OK_AnyValue. If all operands are constants but not the same,
+    // then set the operand kind to OK_NonUniformConstantValue.
+    ConstantInt *CInt = nullptr;
+    for (unsigned i = 0; i < VL.size(); ++i) {
+      const Instruction *I = cast<Instruction>(VL[i]);
+      if (!isa<ConstantInt>(I->getOperand(1))) {
+        Op2VK = TargetTransformInfo::OK_AnyValue;
+        break;
+      }
+      if (i == 0) {
+        CInt = cast<ConstantInt>(I->getOperand(1));
+        continue;
+      }
+      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+          CInt != cast<ConstantInt>(I->getOperand(1)))
+        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+    }
+    // FIXME: Currently the cost model modification for division by a power
+    // of 2 is handled only for X86 and AArch64. Add support for other
+    // targets.
+    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+        CInt->getValue().isPowerOf2())
+      Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+    VecCost =
+        TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, Op1VP, Op2VP);
+    ScalarCost = VL.size() * TTI->getArithmeticInstrCost(
+                                 Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP);
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_UniformConstantValue;
+
+    VecCost =
+        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+    ScalarCost = VL.size() * TTI->getArithmeticInstrCost(
+                                 Instruction::Add, ScalarTy, Op1VK, Op2VK);
+    break;
+  }
+  case Instruction::Load: {
+    // Cost of wide load - cost of scalar loads.
+    unsigned Alignment = cast<LoadInst>(VL0)->getAlignment();
+    VecCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment,
+                                   /*AddressSpace=*/0);
+    ScalarCost =
+        VL.size() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy,
+                                         Alignment, /*AddressSpace=*/0);
+    break;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Calculate the cost.
+    auto *SI = cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+    VecCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment,
+                                   /*AddressSpace=*/0);
+    ScalarCost =
+        VL.size() * TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
+                                         Alignment, /*AddressSpace=*/0);
+    break;
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(VL0);
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+    FastMathFlags FMF;
+    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+      FMF = FPMO->getFastMathFlags();
+
+    // Calculate the cost of the scalar and vector calls.
+    SmallVector<Type *, 4> VecTys;
+    for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
+      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
+                                       VecTy->getVectorNumElements()));
+    }
+
+    VecCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+    ScalarCost =
+        VL.size() * TTI->getIntrinsicInstrCost(
+                        ID, ScalarTy, CI->getFunctionType()->params(), FMF);
+    DEBUG(dbgs() << "SLP: Call cost " << VecCost - ScalarCost << " ("
+                 << VecCost << "-" << ScalarCost << ")"
+                 << " for " << *CI << "\n");
+
+    break;
+  }
+  default:
+    return None;
+  }
+  return VecCost - ScalarCost;
+}
+
 int BoUpSLP::getEntryCost(TreeEntry *E) {
-  ArrayRef<Value*> VL = E->Scalars;
+  ArrayRef<Value *> VL = E->Scalars;
   Type *ScalarTy = VL[0]->getType();
 
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -1651,218 +1929,67 @@
   }
   unsigned Opcode = getSameOpcode(VL);
   assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
-  Instruction *VL0 = cast<Instruction>(VL[0]);
   switch (Opcode) {
-  case Instruction::PHI: {
-    return 0;
-  }
-  case Instruction::ExtractValue:
-  case Instruction::ExtractElement: {
-    if (canReuseExtract(VL, Opcode)) {
-      int DeadCost = 0;
-      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-        Instruction *E = cast<Instruction>(VL[i]);
-        // If all users are going to be vectorized, instruction can be
-        // considered as dead.
-        // The same, if have only one user, it will be vectorized for sure.
-        if (E->hasOneUse() ||
-            std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
-              return ScalarToTreeEntry.count(U) > 0;
-            }))
-          // Take credit for instruction that will become dead.
-          DeadCost +=
-              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
-      }
-      return -DeadCost;
-    }
-    return getGatherCost(VecTy);
-  }
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    Type *SrcTy = VL0->getOperand(0)->getType();
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                       VL0->getType(), SrcTy);
-
-    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::FCmp:
-  case Instruction::ICmp:
-  case Instruction::Select: {
-    // Calculate the cost of this instruction.
-    VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
-    int ScalarCost = VecTy->getNumElements() *
-        TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
-    int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    // Certain instructions can be cheaper to vectorize if they have a
-    // constant second vector operand.
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
-    TargetTransformInfo::OperandValueProperties Op1VP =
-        TargetTransformInfo::OP_None;
-    TargetTransformInfo::OperandValueProperties Op2VP =
-        TargetTransformInfo::OP_None;
-
-    // If all operands are exactly the same ConstantInt then set the
-    // operand kind to OK_UniformConstantValue.
-    // If instead not all operands are constants, then set the operand kind
-    // to OK_AnyValue. If all operands are constants but not the same,
-    // then set the operand kind to OK_NonUniformConstantValue.
-    ConstantInt *CInt = nullptr;
-    for (unsigned i = 0; i < VL.size(); ++i) {
-      const Instruction *I = cast<Instruction>(VL[i]);
-      if (!isa<ConstantInt>(I->getOperand(1))) {
-        Op2VK = TargetTransformInfo::OK_AnyValue;
-        break;
-      }
-      if (i == 0) {
-        CInt = cast<ConstantInt>(I->getOperand(1));
-        continue;
-      }
-      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-          CInt != cast<ConstantInt>(I->getOperand(1)))
-        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-    }
-    // FIXME: Currently cost of model modification for division by power of
-    // 2 is handled for X86 and AArch64. Add support for other targets.
-    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
-        CInt->getValue().isPowerOf2())
-      Op2VP = TargetTransformInfo::OP_PowerOf2;
-
-    int ScalarCost = VecTy->getNumElements() *
-                     TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
-                                                 Op2VK, Op1VP, Op2VP);
-    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
-                                              Op1VP, Op2VP);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::GetElementPtr: {
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
-
-    int ScalarCost =
-        VecTy->getNumElements() *
-        TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
-    int VecCost =
-        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
-
-    return VecCost - ScalarCost;
-  }
-  case Instruction::Load: {
-    // Cost of wide load - cost of scalar loads.
-    unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
-    int ScalarLdCost = VecTy->getNumElements() *
-          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
-    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
-                                         VecTy, alignment, 0);
-    if (E->NeedToShuffle) {
-      VecLdCost += TTI->getShuffleCost(
-          TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0);
+  case Instruction::PHI:
+    return 0;
+  case Instruction::ExtractValue:
+  case Instruction::ExtractElement:
+    if (canReuseExtract(VL, Opcode)) {
+      int DeadCost = 0;
+      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+        Instruction *E = cast<Instruction>(VL[i]);
+        // If all users are going to be vectorized, instruction can be
+        // considered as dead.
+        // The same, if have only one user, it will be vectorized for sure.
+        if (E->hasOneUse() ||
+            std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
+              return ScalarToTreeEntry.count(U) > 0;
+            }))
+          // Take credit for instruction that will become dead.
+          DeadCost +=
+              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
       }
-      return VecLdCost - ScalarLdCost;
+      return -DeadCost;
     }
-  case Instruction::Store: {
-    // We know that we can merge the stores. Calculate the cost.
-    unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
-    int ScalarStCost = VecTy->getNumElements() *
-          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
-    int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
-                                         VecTy, alignment, 0);
-    return VecStCost - ScalarStCost;
-  }
-  case Instruction::Call: {
-    CallInst *CI = cast<CallInst>(VL0);
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
-    // Calculate the cost of the scalar and vector calls.
-    SmallVector<Type*, 4> ScalarTys, VecTys;
-    for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
-      ScalarTys.push_back(CI->getArgOperand(op)->getType());
-      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
-                                       VecTy->getNumElements()));
-    }
-
-    FastMathFlags FMF;
-    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
-      FMF = FPMO->getFastMathFlags();
-
-    int ScalarCallCost = VecTy->getNumElements() *
-        TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
-
-    int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
-
-    DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
-          << " (" << VecCallCost << "-" << ScalarCallCost << ")"
-          << " for " << *CI << "\n");
-
-    return VecCallCost - ScalarCallCost;
+    return getGatherCost(VecTy);
+  case Instruction::ShuffleVector: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_AnyValue;
+    int ScalarCost = 0;
+    int VecCost = 0;
+    for (Value *i : VL) {
+      Instruction *I = cast<Instruction>(i);
+      if (!I)
+        break;
+      ScalarCost +=
+          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
     }
-  case Instruction::ShuffleVector: {
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_AnyValue;
-    int ScalarCost = 0;
-    int VecCost = 0;
-    for (Value *i : VL) {
-      Instruction *I = cast<Instruction>(i);
-      if (!I)
-        break;
-      ScalarCost +=
-          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
-    }
-    // VecCost is equal to sum of the cost of creating 2 vectors
-    // and the cost of creating shuffle.
-    Instruction *I0 = cast<Instruction>(VL[0]);
-    VecCost =
-        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
-    Instruction *I1 = cast<Instruction>(VL[1]);
-    VecCost +=
-        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
-    VecCost +=
-        TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
-    return VecCost - ScalarCost;
+    // VecCost is equal to sum of the cost of creating 2 vectors
+    // and the cost of creating shuffle.
+    Instruction *I0 = cast<Instruction>(VL[0]);
+    VecCost = TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+    Instruction *I1 = cast<Instruction>(VL[1]);
+    VecCost +=
+        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+    VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+    return VecCost - ScalarCost;
+  }
+  case Instruction::Load: {
+    int Cost = getCost(Opcode, VL, ScalarTy, VecTy).getValue();
+    if (E->NeedToShuffle) {
+      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  VecTy, 0);
     }
-  default:
-    llvm_unreachable("Unknown instruction");
+    return Cost;
+  }
+  default:
+    if (Optional<int> Cost = getCost(Opcode, VL, ScalarTy, VecTy))
+      return Cost.getValue();
+    break;
   }
+  llvm_unreachable("Unknown instruction");
 }
 
 bool BoUpSLP::isFullyVectorizableTinyTree() {
@@ -4871,6 +4998,7 @@
 
   VisitedInstrs.clear();
 
+  SmallVector<Value *, 4> ExtractInsts;
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
     // We may go through BB multiple times so skip the one we have checked.
     if (!VisitedInstrs.insert(&*it).second)
@@ -4986,8 +5114,15 @@
       }
     }
+
+    if (auto *EE = dyn_cast<ExtractElementInst>(&*it)) {
+      ExtractInsts.push_back(EE);
+      continue;
+    }
   }
 
+  Changed |= R.tryToWidenExtractElementInsts(ExtractInsts);
+
   return Changed;
 }
Index: test/Transforms/SLPVectorizer/X86/vector.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/vector.ll
+++ test/Transforms/SLPVectorizer/X86/vector.ll
@@ -17,17 +17,15 @@
 define i1 @cmpv2f32(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @cmpv2f32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[Y0:%.*]] = extractelement <2 x i32> [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[X0]], [[Y0]]
-; CHECK-NEXT:    br i1 [[CMP0]], label [[IF:%.*]], label [[ENDIF:%.*]]
+; CHECK-NEXT:    [[WIDEN_VECT1:%.*]] = icmp eq <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[WIDEN_EXTRACT2:%.*]] = extractelement <2 x i1> [[WIDEN_VECT1]], i32 0
+; CHECK-NEXT:    br i1 [[WIDEN_EXTRACT2]], label [[IF:%.*]], label [[ENDIF:%.*]]
 ; CHECK:       if:
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x i32> [[X]], i32 1
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i32 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[X1]], [[Y1]]
+; CHECK-NEXT:    [[WIDEN_VECT:%.*]] = icmp eq <2 x i32> [[X]], [[Y]]
+; CHECK-NEXT:    [[WIDEN_EXTRACT:%.*]] = extractelement <2 x i1> [[WIDEN_VECT]], i32 1
 ; CHECK-NEXT:    br label [[ENDIF]]
 ; CHECK:       endif:
-; CHECK-NEXT:    [[AND_OF_CMPS:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP1]], [[IF]] ]
+; CHECK-NEXT:    [[AND_OF_CMPS:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[WIDEN_EXTRACT]], [[IF]] ]
 ; CHECK-NEXT:    ret i1 [[AND_OF_CMPS]]
 ;
 entry:
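
Note: as a reader aid (not part of the patch), the intended rewrite can be
sketched on plain IR. The function names below are hypothetical, and whether
the widening actually fires depends on the target cost model and the
-slp-threshold setting; the vector.ll test above shows the checked-in form.

; Before widening: a scalar compare built from two same-lane extracts.
define i1 @widen_sketch(<2 x i32> %x, <2 x i32> %y) {
  %x0 = extractelement <2 x i32> %x, i32 0
  %y0 = extractelement <2 x i32> %y, i32 0
  %cmp0 = icmp eq i32 %x0, %y0
  ret i1 %cmp0
}

; After widening: one vector compare plus a single extract of the same lane.
define i1 @widen_sketch_expected(<2 x i32> %x, <2 x i32> %y) {
  %widen.vect = icmp eq <2 x i32> %x, %y
  %widen.extract = extractelement <2 x i1> %widen.vect, i32 0
  ret i1 %widen.extract
}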