Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -413,6 +413,13 @@
 private:
   struct TreeEntry;
 
+  /// Calculates the cost of the scalar sequence of instructions (if only \p
+  /// ScalarTy is specified), the vector sequence (if only \p VecTy is
+  /// specified) or the vector - scalar cost (if both \p ScalarTy and \p
+  /// VecTy are specified).
+  Optional<int> getCostOrDiff(unsigned Opcode, ArrayRef<Value *> VL,
+                              Type *ScalarTy, Type *VecTy = nullptr) const;
+
   /// \returns the cost of the vectorizable entry.
   int getEntryCost(TreeEntry *E);
 
@@ -1630,6 +1637,199 @@
   return true;
 }
 
+Optional<int> BoUpSLP::getCostOrDiff(unsigned Opcode, ArrayRef<Value *> VL,
+                                     Type *ScalarTy, Type *VecTy) const {
+  assert((ScalarTy || VecTy) &&
+         "at least one of ScalarTy/VectorTy parameters must be specified.");
+  assert(Opcode && "Expected non-null opcode.");
+  auto *VL0 = cast<Instruction>(VL[0]);
+  int VecCost;
+  int ScalarCost;
+  // Do we need the difference of the scalar and vector costs.
+  bool Diff = ScalarTy && VecTy;
+  switch (Opcode) {
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    Type *SrcTy = VL0->getOperand(0)->getType();
+    if (VecTy) {
+      VecCost = TTI->getCastInstrCost(
+          Opcode, VecTy, VectorType::get(SrcTy, VecTy->getVectorNumElements()));
+      if (!Diff)
+        return VecCost;
+    }
+
+    // Calculate the cost of this instruction.
+    ScalarCost = VL.size() * TTI->getCastInstrCost(Opcode, ScalarTy, SrcTy);
+    break;
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+  case Instruction::Select: {
+    // Calculate the cost of this instruction.
+    if (VecTy) {
+      VecCost = TTI->getCmpSelInstrCost(
+          Opcode, VecTy,
+          VectorType::get(Type::getInt1Ty(VL0->getContext()),
+                          VecTy->getVectorNumElements()));
+      if (!Diff)
+        return VecCost;
+    }
+    ScalarCost =
+        VL.size() * TTI->getCmpSelInstrCost(Opcode, ScalarTy,
+                                            Type::getInt1Ty(VL0->getContext()));
+    break;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    // Certain instructions can be cheaper to vectorize if they have a
+    // constant second vector operand.
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_UniformConstantValue;
+    TargetTransformInfo::OperandValueProperties Op1VP =
+        TargetTransformInfo::OP_None;
+    TargetTransformInfo::OperandValueProperties Op2VP =
+        TargetTransformInfo::OP_None;
+
+    // If all operands are exactly the same ConstantInt then set the
+    // operand kind to OK_UniformConstantValue.
+    // If instead not all operands are constants, then set the operand kind
+    // to OK_AnyValue. If all operands are constants but not the same,
+    // then set the operand kind to OK_NonUniformConstantValue.
+    ConstantInt *CInt = nullptr;
+    for (unsigned i = 0; i < VL.size(); ++i) {
+      const Instruction *I = cast<Instruction>(VL[i]);
+      if (!isa<ConstantInt>(I->getOperand(1))) {
+        Op2VK = TargetTransformInfo::OK_AnyValue;
+        break;
+      }
+      if (i == 0) {
+        CInt = cast<ConstantInt>(I->getOperand(1));
+        continue;
+      }
+      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+          CInt != cast<ConstantInt>(I->getOperand(1)))
+        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+    }
+    // FIXME: Currently cost of model modification for division by power of
+    // 2 is handled for X86 and AArch64. Add support for other targets.
+    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+        CInt->getValue().isPowerOf2())
+      Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+    if (VecTy) {
+      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, Op1VP,
+                                            Op2VP);
+      if (!Diff)
+        return VecCost;
+    }
+    ScalarCost = VL.size() * TTI->getArithmeticInstrCost(
+                                 Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP);
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_UniformConstantValue;
+
+    if (VecTy) {
+      VecCost =
+          TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+      if (!Diff)
+        return VecCost;
+    }
+    ScalarCost = VL.size() * TTI->getArithmeticInstrCost(
+                                 Instruction::Add, ScalarTy, Op1VK, Op2VK);
+    break;
+  }
+  case Instruction::Load: {
+    // Cost of wide load - cost of scalar loads.
+    unsigned Alignment = cast<LoadInst>(VL0)->getAlignment();
+    if (VecTy) {
+      VecCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment,
+                                     /*AddressSpace=*/0);
+      if (!Diff)
+        return VecCost;
+    }
+    ScalarCost =
+        VL.size() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
+                                         /*AddressSpace=*/0);
+    break;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Calculate the cost.
+    auto *SI = cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+    if (VecTy) {
+      VecCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment,
+                                     /*AddressSpace=*/0);
+      if (!Diff)
+        return VecCost;
+    }
+    ScalarCost =
+        VL.size() * TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
+                                         Alignment, /*AddressSpace=*/0);
+    break;
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(VL0);
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+    FastMathFlags FMF;
+    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+      FMF = FPMO->getFastMathFlags();
+
+    if (VecTy) {
+      // Calculate the cost of the scalar and vector calls.
+      SmallVector<Type *, 4> VecTys;
+      for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
+        VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
+                                         VecTy->getVectorNumElements()));
+      }
+
+      VecCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+      if (!Diff)
+        return VecCost;
+    }
+    ScalarCost =
+        VL.size() * TTI->getIntrinsicInstrCost(
+                        ID, ScalarTy, CI->getFunctionType()->params(), FMF);
+    break;
+  }
+  default:
+    return None;
+  }
+  return Diff ? VecCost - ScalarCost : ScalarCost;
+}
+
 int BoUpSLP::getEntryCost(TreeEntry *E) {
   ArrayRef<Value *> VL = E->Scalars;
 
@@ -1678,7 +1878,7 @@
       }
       return -DeadCost;
     }
-    return getGatherCost(VecTy);
+    return getGatherCost(VecTy);
   }
   case Instruction::ZExt:
   case Instruction::SExt:
@@ -1691,27 +1891,10 @@
   case Instruction::UIToFP:
   case Instruction::Trunc:
   case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    Type *SrcTy = VL0->getOperand(0)->getType();
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                       VL0->getType(), SrcTy);
-
-    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
-    return VecCost - ScalarCost;
-  }
+  case Instruction::BitCast:
   case Instruction::FCmp:
   case Instruction::ICmp:
-  case Instruction::Select: {
-    // Calculate the cost of this instruction.
-    VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
-    int ScalarCost = VecTy->getNumElements() *
-        TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
-    int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
-    return VecCost - ScalarCost;
-  }
+  case Instruction::Select:
   case Instruction::Add:
   case Instruction::FAdd:
   case Instruction::Sub:
@@ -1729,107 +1912,24 @@
   case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor: {
-    // Certain instructions can be cheaper to vectorize if they have a
-    // constant second vector operand.
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
-    TargetTransformInfo::OperandValueProperties Op1VP =
-        TargetTransformInfo::OP_None;
-    TargetTransformInfo::OperandValueProperties Op2VP =
-        TargetTransformInfo::OP_None;
-
-    // If all operands are exactly the same ConstantInt then set the
-    // operand kind to OK_UniformConstantValue.
-    // If instead not all operands are constants, then set the operand kind
-    // to OK_AnyValue. If all operands are constants but not the same,
-    // then set the operand kind to OK_NonUniformConstantValue.
-    ConstantInt *CInt = nullptr;
-    for (unsigned i = 0; i < VL.size(); ++i) {
-      const Instruction *I = cast<Instruction>(VL[i]);
-      if (!isa<ConstantInt>(I->getOperand(1))) {
-        Op2VK = TargetTransformInfo::OK_AnyValue;
-        break;
-      }
-      if (i == 0) {
-        CInt = cast<ConstantInt>(I->getOperand(1));
-        continue;
-      }
-      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-          CInt != cast<ConstantInt>(I->getOperand(1)))
-        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-    }
-    // FIXME: Currently cost of model modification for division by power of
-    // 2 is handled for X86 and AArch64. Add support for other targets.
-    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
-        CInt->getValue().isPowerOf2())
-      Op2VP = TargetTransformInfo::OP_PowerOf2;
-
-    int ScalarCost = VecTy->getNumElements() *
-                     TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
-                                                 Op2VK, Op1VP, Op2VP);
-    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
-                                              Op1VP, Op2VP);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::GetElementPtr: {
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
-
-    int ScalarCost =
-        VecTy->getNumElements() *
-        TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
-    int VecCost =
-        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
-
-    return VecCost - ScalarCost;
-  }
+  case Instruction::Xor:
+  case Instruction::GetElementPtr:
+  case Instruction::Store:
+    return getCostOrDiff(Opcode, VL, ScalarTy, VecTy).getValue();
   case Instruction::Load: {
-    // Cost of wide load - cost of scalar loads.
-    unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
-    int ScalarLdCost = VecTy->getNumElements() *
-          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
-    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
-                                         VecTy, alignment, 0);
+    int Cost = getCostOrDiff(Opcode, VL, ScalarTy, VecTy).getValue();
     if (!E->ShuffleMask.empty()) {
-      VecLdCost += TTI->getShuffleCost(
-          TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0);
+      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  VecTy, 0);
     }
-    return VecLdCost - ScalarLdCost;
-  }
-  case Instruction::Store: {
-    // We know that we can merge the stores. Calculate the cost.
-    unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
-    int ScalarStCost = VecTy->getNumElements() *
-          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
-    int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
-                                         VecTy, alignment, 0);
-    return VecStCost - ScalarStCost;
+    return Cost;
   }
   case Instruction::Call: {
     CallInst *CI = cast<CallInst>(VL0);
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
-    // Calculate the cost of the scalar and vector calls.
-    SmallVector<Type*, 4> ScalarTys, VecTys;
-    for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
-      ScalarTys.push_back(CI->getArgOperand(op)->getType());
-      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
-                                       VecTy->getNumElements()));
-    }
-
-    FastMathFlags FMF;
-    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
-      FMF = FPMO->getFastMathFlags();
-
-    int ScalarCallCost = VecTy->getNumElements() *
-        TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
+    int ScalarCallCost = getCostOrDiff(Opcode, VL, ScalarTy).getValue();
 
-    int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+    int VecCallCost =
+        getCostOrDiff(Opcode, VL, /*ScalarTy=*/nullptr, VecTy).getValue();
 
     DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
           << " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -4878,6 +4978,7 @@
 
   VisitedInstrs.clear();
 
+  SmallVector<Instruction *, 4> ExtractInsts;
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
     // We may go through BB multiple times so skip the one we have checked.
     if (!VisitedInstrs.insert(&*it).second)