Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -413,6 +413,11 @@
 private:
   struct TreeEntry;
 
+  /// Calculates the cost of the transformation of \p VL instructions from
+  /// scalar to vector form.
+  Optional<int> getCost(unsigned Opcode, ArrayRef<Value *> VL, Type *ScalarTy,
+                        Type *VecTy) const;
+
   /// \returns the cost of the vectorizable entry.
   int getEntryCost(TreeEntry *E);
 
@@ -1627,8 +1632,175 @@
   return true;
 }
 
+Optional<int> BoUpSLP::getCost(unsigned Opcode, ArrayRef<Value *> VL,
+                               Type *ScalarTy, Type *VecTy) const {
+  assert(ScalarTy && VecTy &&
+         "both ScalarTy/VectorTy parameters must be specified.");
+  assert(Opcode && "Expected non-null opcode.");
+  auto *VL0 = cast<Instruction>(VL[0]);
+  int VecCost;
+  int ScalarCost;
+  switch (Opcode) {
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    Type *SrcTy = VL0->getOperand(0)->getType();
+    VecCost = TTI->getCastInstrCost(
+        Opcode, VecTy, VectorType::get(SrcTy, VecTy->getVectorNumElements()));
+
+    // Calculate the cost of this instruction.
+    ScalarCost = VL.size() * TTI->getCastInstrCost(Opcode, ScalarTy, SrcTy);
+    break;
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+  case Instruction::Select: {
+    // Calculate the cost of this instruction.
+    VecCost = TTI->getCmpSelInstrCost(
+        Opcode, VecTy,
+        VectorType::get(Type::getInt1Ty(VL0->getContext()),
+                        VecTy->getVectorNumElements()));
+    ScalarCost =
+        VL.size() * TTI->getCmpSelInstrCost(Opcode, ScalarTy,
+                                            Type::getInt1Ty(VL0->getContext()));
+    break;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    // Certain instructions can be cheaper to vectorize if they have a
+    // constant second vector operand.
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_UniformConstantValue;
+    TargetTransformInfo::OperandValueProperties Op1VP =
+        TargetTransformInfo::OP_None;
+    TargetTransformInfo::OperandValueProperties Op2VP =
+        TargetTransformInfo::OP_None;
+
+    // If all operands are exactly the same ConstantInt then set the
+    // operand kind to OK_UniformConstantValue.
+    // If instead not all operands are constants, then set the operand kind
+    // to OK_AnyValue. If all operands are constants but not the same,
+    // then set the operand kind to OK_NonUniformConstantValue.
+    ConstantInt *CInt = nullptr;
+    for (unsigned i = 0; i < VL.size(); ++i) {
+      const Instruction *I = cast<Instruction>(VL[i]);
+      if (!isa<ConstantInt>(I->getOperand(1))) {
+        Op2VK = TargetTransformInfo::OK_AnyValue;
+        break;
+      }
+      if (i == 0) {
+        CInt = cast<ConstantInt>(I->getOperand(1));
+        continue;
+      }
+      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+          CInt != cast<ConstantInt>(I->getOperand(1)))
+        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+    }
+    // FIXME: Currently cost of model modification for division by power of
+    // 2 is handled for X86 and AArch64. Add support for other targets.
+    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+        CInt->getValue().isPowerOf2())
+      Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+    VecCost =
+        TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, Op1VP, Op2VP);
+    ScalarCost = VL.size() * TTI->getArithmeticInstrCost(
+                                 Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP);
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_UniformConstantValue;
+
+    VecCost =
+        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+    ScalarCost = VL.size() * TTI->getArithmeticInstrCost(
+                                 Instruction::Add, ScalarTy, Op1VK, Op2VK);
+    break;
+  }
+  case Instruction::Load: {
+    // Cost of wide load - cost of scalar loads.
+    unsigned Alignment = cast<LoadInst>(VL0)->getAlignment();
+    VecCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment,
+                                   /*AddressSpace=*/0);
+    ScalarCost =
+        VL.size() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
+                                         /*AddressSpace=*/0);
+    break;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Calculate the cost.
+    auto *SI = cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+    VecCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment,
+                                   /*AddressSpace=*/0);
+    ScalarCost =
+        VL.size() * TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
+                                         Alignment, /*AddressSpace=*/0);
+    break;
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(VL0);
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+    FastMathFlags FMF;
+    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+      FMF = FPMO->getFastMathFlags();
+
+    // Calculate the cost of the scalar and vector calls.
+    SmallVector<Type *, 4> VecTys;
+    for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
+      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
+                                       VecTy->getVectorNumElements()));
+    }
+
+    VecCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+    ScalarCost =
+        VL.size() * TTI->getIntrinsicInstrCost(
+                        ID, ScalarTy, CI->getFunctionType()->params(), FMF);
+    DEBUG(dbgs() << "SLP: Call cost " << VecCost - ScalarCost << " (" << VecCost
+                 << "-" << ScalarCost << ")"
+                 << " for " << *CI << "\n");
+
+    break;
+  }
+  default:
+    return None;
+  }
+  return VecCost - ScalarCost;
+}
+
 int BoUpSLP::getEntryCost(TreeEntry *E) {
-  ArrayRef<Value*> VL = E->Scalars;
+  ArrayRef<Value *> VL = E->Scalars;
 
   Type *ScalarTy = VL[0]->getType();
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -1651,218 +1823,67 @@
   }
   unsigned Opcode = getSameOpcode(VL);
   assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
-  Instruction *VL0 = cast<Instruction>(VL[0]);
   switch (Opcode) {
-  case Instruction::PHI: {
-    return 0;
-  }
-  case Instruction::ExtractValue:
-  case Instruction::ExtractElement: {
-    if (canReuseExtract(VL, Opcode)) {
-      int DeadCost = 0;
-      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-        Instruction *E = cast<Instruction>(VL[i]);
-        // If all users are going to be vectorized, instruction can be
-        // considered as dead.
-        // The same, if have only one user, it will be vectorized for sure.
-        if (E->hasOneUse() ||
-            std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
-              return ScalarToTreeEntry.count(U) > 0;
-            }))
-          // Take credit for instruction that will become dead.
-          DeadCost +=
-              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
-      }
-      return -DeadCost;
-    }
-    return getGatherCost(VecTy);
-  }
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    Type *SrcTy = VL0->getOperand(0)->getType();
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                       VL0->getType(), SrcTy);
-
-    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::FCmp:
-  case Instruction::ICmp:
-  case Instruction::Select: {
-    // Calculate the cost of this instruction.
-    VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
-    int ScalarCost = VecTy->getNumElements() *
-        TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
-    int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    // Certain instructions can be cheaper to vectorize if they have a
-    // constant second vector operand.
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
-    TargetTransformInfo::OperandValueProperties Op1VP =
-        TargetTransformInfo::OP_None;
-    TargetTransformInfo::OperandValueProperties Op2VP =
-        TargetTransformInfo::OP_None;
-
-    // If all operands are exactly the same ConstantInt then set the
-    // operand kind to OK_UniformConstantValue.
-    // If instead not all operands are constants, then set the operand kind
-    // to OK_AnyValue. If all operands are constants but not the same,
-    // then set the operand kind to OK_NonUniformConstantValue.
-    ConstantInt *CInt = nullptr;
-    for (unsigned i = 0; i < VL.size(); ++i) {
-      const Instruction *I = cast<Instruction>(VL[i]);
-      if (!isa<ConstantInt>(I->getOperand(1))) {
-        Op2VK = TargetTransformInfo::OK_AnyValue;
-        break;
-      }
-      if (i == 0) {
-        CInt = cast<ConstantInt>(I->getOperand(1));
-        continue;
-      }
-      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-          CInt != cast<ConstantInt>(I->getOperand(1)))
-        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-    }
-    // FIXME: Currently cost of model modification for division by power of
-    // 2 is handled for X86 and AArch64. Add support for other targets.
-    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
-        CInt->getValue().isPowerOf2())
-      Op2VP = TargetTransformInfo::OP_PowerOf2;
-
-    int ScalarCost = VecTy->getNumElements() *
-                     TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
-                                                 Op2VK, Op1VP, Op2VP);
-    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
-                                              Op1VP, Op2VP);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::GetElementPtr: {
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
-
-    int ScalarCost =
-        VecTy->getNumElements() *
-        TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
-    int VecCost =
-        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
-
-    return VecCost - ScalarCost;
-  }
-  case Instruction::Load: {
-    // Cost of wide load - cost of scalar loads.
-    unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
-    int ScalarLdCost = VecTy->getNumElements() *
-        TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
-    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
-                                         VecTy, alignment, 0);
-    if (E->NeedToShuffle) {
-      VecLdCost += TTI->getShuffleCost(
-          TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0);
+  case Instruction::PHI:
+    return 0;
+  case Instruction::ExtractValue:
+  case Instruction::ExtractElement:
+    if (canReuseExtract(VL, Opcode)) {
+      int DeadCost = 0;
+      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+        Instruction *E = cast<Instruction>(VL[i]);
+        // If all users are going to be vectorized, instruction can be
+        // considered as dead.
+        // The same, if have only one user, it will be vectorized for sure.
+        if (E->hasOneUse() ||
+            std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
+              return ScalarToTreeEntry.count(U) > 0;
+            }))
+          // Take credit for instruction that will become dead.
+          DeadCost +=
+              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
       }
-    return VecLdCost - ScalarLdCost;
-  }
-  case Instruction::Store: {
-    // We know that we can merge the stores. Calculate the cost.
-    unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
-    int ScalarStCost = VecTy->getNumElements() *
-        TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
-    int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
-                                         VecTy, alignment, 0);
-    return VecStCost - ScalarStCost;
+      return -DeadCost;
     }
-  case Instruction::Call: {
-    CallInst *CI = cast<CallInst>(VL0);
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
-    // Calculate the cost of the scalar and vector calls.
-    SmallVector<Type*, 4> ScalarTys, VecTys;
-    for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
-      ScalarTys.push_back(CI->getArgOperand(op)->getType());
-      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
-                                       VecTy->getNumElements()));
-    }
-
-    FastMathFlags FMF;
-    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
-      FMF = FPMO->getFastMathFlags();
-
-    int ScalarCallCost = VecTy->getNumElements() *
-        TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
-
-    int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
-
-    DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
-          << " (" << VecCallCost << "-" << ScalarCallCost << ")"
-          << " for " << *CI << "\n");
-
-    return VecCallCost - ScalarCallCost;
+    return getGatherCost(VecTy);
+  case Instruction::ShuffleVector: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_AnyValue;
+    int ScalarCost = 0;
+    int VecCost = 0;
+    for (Value *i : VL) {
+      Instruction *I = cast<Instruction>(i);
+      if (!I)
+        break;
+      ScalarCost +=
+          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
     }
-  case Instruction::ShuffleVector: {
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_AnyValue;
-    int ScalarCost = 0;
-    int VecCost = 0;
-    for (Value *i : VL) {
-      Instruction *I = cast<Instruction>(i);
-      if (!I)
-        break;
-      ScalarCost +=
-          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
-    }
-    // VecCost is equal to sum of the cost of creating 2 vectors
-    // and the cost of creating shuffle.
-    Instruction *I0 = cast<Instruction>(VL[0]);
-    VecCost =
-        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
-    Instruction *I1 = cast<Instruction>(VL[1]);
-    VecCost +=
-        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
-    VecCost +=
-        TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
-    return VecCost - ScalarCost;
+    // VecCost is equal to sum of the cost of creating 2 vectors
+    // and the cost of creating shuffle.
+    Instruction *I0 = cast<Instruction>(VL[0]);
+    VecCost = TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+    Instruction *I1 = cast<Instruction>(VL[1]);
+    VecCost +=
+        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+    VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+    return VecCost - ScalarCost;
+  }
+  case Instruction::Load: {
+    int Cost = getCost(Opcode, VL, ScalarTy, VecTy).getValue();
+    if (E->NeedToShuffle) {
+      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  VecTy, 0);
     }
-  default:
-    llvm_unreachable("Unknown instruction");
+    return Cost;
+  }
+  default:
+    if (Optional<int> Cost = getCost(Opcode, VL, ScalarTy, VecTy))
+      return Cost.getValue();
+    break;
   }
+  llvm_unreachable("Unknown instruction");
 }
 
 bool BoUpSLP::isFullyVectorizableTinyTree() {