Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -701,6 +701,30 @@
   bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;
 
+  /// \return True if the number of call preserved registers is available
+  /// for the target. This is the case when the target has a single class
+  /// of vector register, or when the number of preserved registers is the
+  /// same for every class (e.g. the x86-64 System V ABI, where no XMM, YMM
+  /// or ZMM registers are preserved). If the ABI preserves different numbers
+  /// depending on the class, it can't be handled without tracking in which
+  /// class of register a value will be stored. For example, Win64 preserves
+  /// XMM8-XMM15. If the target supports AVX, a value may be in either an
+  /// XMM or a YMM register, and the ABI cannot be handled. However, if the
+  /// target does not support AVX, a vector value can only be in an XMM
+  /// register and a value can be returned for the ABI.
+  bool hasNumberOfCallPreservedRegisters(bool Vector) const;
+
+  /// \return The number of scalar or vector registers preserved across a
+  /// call. The returned value only applies when
+  /// hasNumberOfCallPreservedRegisters returns true.
+  unsigned getNumberOfCallPreservedRegisters(bool Vector) const;
+
+  /// \return The maximum interleave factor that should be used for the
+  /// target when a loop contains a call. By default, the target's
+  /// getMaxInterleaveFactor value is used. The returned value only applies
+  /// when hasNumberOfCallPreservedRegisters returns false.
+  unsigned getMaxCallInterleaveFactor(unsigned VF) const;
+
   /// \return The size of a cache line in bytes.
  unsigned getCacheLineSize() const;
@@ -1086,6 +1110,9 @@
   virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                             Type *Ty) = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
+  virtual bool hasNumberOfCallPreservedRegisters(bool Vector) = 0;
+  virtual unsigned getNumberOfCallPreservedRegisters(bool Vector) = 0;
+  virtual unsigned getMaxCallInterleaveFactor(unsigned VF) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
@@ -1380,6 +1407,15 @@
   unsigned getNumberOfRegisters(bool Vector) override {
     return Impl.getNumberOfRegisters(Vector);
   }
+  bool hasNumberOfCallPreservedRegisters(bool Vector) override {
+    return Impl.hasNumberOfCallPreservedRegisters(Vector);
+  }
+  unsigned getNumberOfCallPreservedRegisters(bool Vector) override {
+    return Impl.getNumberOfCallPreservedRegisters(Vector);
+  }
+  unsigned getMaxCallInterleaveFactor(unsigned VF) override {
+    return Impl.getMaxCallInterleaveFactor(VF);
+  }
   unsigned getRegisterBitWidth(bool Vector) const override {
     return Impl.getRegisterBitWidth(Vector);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -350,6 +350,12 @@
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
 
+  bool hasNumberOfCallPreservedRegisters(bool Vector) { return false; }
+
+  unsigned getNumberOfCallPreservedRegisters(bool Vector) { return 0; }
+
+  unsigned getMaxCallInterleaveFactor(unsigned VF) { return 1; }
+
   unsigned getMinVectorRegisterBitWidth() { return 128; }
 
   bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -441,6 +441,12 @@
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
 
+  unsigned getMaxCallInterleaveFactor(unsigned VF) {
+    // By default the maximum call interleave factor is the target's
+    // maximum interleave factor (i.e. calls have no effect).
+    return static_cast<T *>(this)->getMaxInterleaveFactor(VF);
+  }
+
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -332,6 +332,25 @@
   return TTIImpl->getNumberOfRegisters(Vector);
 }
 
+bool TargetTransformInfo::hasNumberOfCallPreservedRegisters(bool Vector) const {
+  return TTIImpl->hasNumberOfCallPreservedRegisters(Vector);
+}
+
+unsigned
+TargetTransformInfo::getNumberOfCallPreservedRegisters(bool Vector) const {
+  assert(TTIImpl->hasNumberOfCallPreservedRegisters(Vector) &&
+         "Shouldn't be called when hasNumberOfCallPreservedRegisters() "
+         "returns false");
+  return TTIImpl->getNumberOfCallPreservedRegisters(Vector);
+}
+
+unsigned TargetTransformInfo::getMaxCallInterleaveFactor(unsigned VF) const {
+  assert(!TTIImpl->hasNumberOfCallPreservedRegisters(VF > 1) &&
+         "Shouldn't be called when hasNumberOfCallPreservedRegisters() "
+         "returns true");
+  return TTIImpl->getMaxCallInterleaveFactor(VF);
+}
+
 unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
   return TTIImpl->getRegisterBitWidth(Vector);
 }
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -62,6 +62,8 @@
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
   unsigned getMaxInterleaveFactor(unsigned VF);
+  bool hasNumberOfCallPreservedRegisters(bool Vector);
+  unsigned getNumberOfCallPreservedRegisters(bool Vector);
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -169,6 +169,31 @@
   return 2;
 }
 
+bool X86TTIImpl::hasNumberOfCallPreservedRegisters(bool Vector) {
+  if (!Vector || !ST->is64Bit())
+    return false;
+
+  // The Win64 ABI preserves XMM8-XMM15. If the target does not support
+  // AVX, then a vector value must be in an XMM register. Otherwise
+  // the value could be in a YMM or ZMM register (in which case it
+  // will not be preserved).
+  if (ST->isOSWindows() && !ST->hasAVX())
+    return true;
+
+  // If the target uses the System V ABI, no vector registers are
+  // preserved.
+  return (ST->isTargetDarwin() || ST->isTargetLinux() ||
+          ST->isTargetSolaris() || ST->isTargetKFreeBSD() ||
+          ST->isTargetPS4());
+}
+
+unsigned X86TTIImpl::getNumberOfCallPreservedRegisters(bool Vector) {
+  if (ST->isOSWindows())
+    return 8;
+
+  return 0;
+}
+
 int X86TTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty,
     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1227,11 +1227,18 @@
 
     /// Holds the maximum number of concurrent live intervals in the loop.
     unsigned MaxLocalUsers;
+
+    /// Holds the maximum number of live registers at a callsite in the
+    /// loop.
+    unsigned MaxCallUsers;
+
+    /// Holds the maximum interleave factor for a call in the loop.
+    unsigned MaxCallIC;
   };
 
   /// \return Returns information about the register usages of the loop for the
   /// given vectorization factors.
-  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
+  SmallVector<RegisterUsage, 8>
+  calculateRegisterUsage(ArrayRef<unsigned> VFs, bool handleCalls = false);
 
   /// Collect values we want to ignore in the cost model.
   void collectValuesToIgnore();
@@ -3193,6 +3200,7 @@
   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
 }
+
 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
@@ -5169,7 +5177,7 @@
     TargetNumRegisters = ForceTargetNumVectorRegs;
   }
 
-  RegisterUsage R = calculateRegisterUsage({VF})[0];
+  RegisterUsage R = calculateRegisterUsage({VF}, true)[0];
   // We divide by these constants so assume that we have at least one
   // instruction that uses at least one register.
   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
@@ -5190,6 +5198,12 @@
     IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
                        std::max(1U, (R.MaxLocalUsers - 1)));
 
+  // If the loop contains a call with live values at the callsite, use an IC
+  // no greater than the maximum call IC (rounded down to a power of 2). An
+  // IC above this will cause additional spilling.
+  if (R.MaxCallUsers > 0)
+    IC = std::min(IC, (unsigned)PowerOf2Floor(R.MaxCallIC));
+
   // Clamp the interleave ranges to reasonable counts.
   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
 
@@ -5278,7 +5292,8 @@
 }
 
 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
-LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
+LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs,
+                                                   bool handleCalls) {
   // This function calculates the register usage by measuring the highest
   // number of values that are alive at a single location. Obviously, this
   // is a very rough estimation. We scan the loop in a topological order in
   // order to assign a number to each instruction.
@@ -5362,6 +5377,8 @@
 
   SmallVector<RegisterUsage, 8> RUs(VFs.size());
   SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
+  SmallVector<unsigned, 8> MaxCallUsages(VFs.size(), 0);
+  SmallVector<unsigned, 8> MaxCallIC(VFs.size(), -1);
 
   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
 
@@ -5391,21 +5408,81 @@
 
     // For each VF find the maximum usage of registers.
     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
-      if (VFs[j] == 1) {
-        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
-        continue;
-      }
-      collectUniformsAndScalars(VFs[j]);
-      // Count the number of live intervals.
       unsigned RegUsage = 0;
-      for (auto Inst : OpenIntervals) {
-        // Skip ignored values for VF > 1.
-        if (VecValuesToIgnore.count(Inst) ||
-            isScalarAfterVectorization(Inst, VFs[j]))
-          continue;
-        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+      if (VFs[j] == 1)
+        RegUsage = OpenIntervals.size();
+      else {
+        collectUniformsAndScalars(VFs[j]);
+        // Count the number of live intervals.
+        for (auto Inst : OpenIntervals) {
+          // Skip ignored values for VF > 1.
+          if (VecValuesToIgnore.count(Inst) ||
+              isScalarAfterVectorization(Inst, VFs[j]))
+            continue;
+          RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+        }
       }
       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
+
+      // If the instruction is a call, calculate the register pressure.
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        if (!handleCalls)
+          continue;
+
+        MaxCallUsages[j] = std::max(MaxCallUsages[j], RegUsage);
+
+        if (!TTI.hasNumberOfCallPreservedRegisters(VFs[j] > 1)) {
+          MaxCallIC[j] = TTI.getMaxCallInterleaveFactor(VFs[j]);
+          continue;
+        }
+
+        if (RegUsage == 0)
+          continue;
+
+        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+        bool NeedToScalarize;
+        unsigned CallCost =
+            getVectorCallCost(CI, VFs[j], TTI, TLI, NeedToScalarize);
+
+        // If the vector intrinsic is cheaper than the vector call, assume
+        // it will be expanded, leaving no call (if it is the same cost it
+        // is likely to be scalarized).
+        if (ID && getVectorIntrinsicCost(CI, VFs[j], TTI, TLI) < CallCost)
+          continue;
+
+        // Calculate the number of live values that are not used after the
+        // call. This is the number of values that "end" at the next
+        // instruction.
+        unsigned DeadAfterCall = 0;
+
+        if ((i + 1) == Index)
+          DeadAfterCall = RegUsage;
+        else {
+          InstrList &List = TransposeEnds[i + 1];
+          if (VFs[j] == 1)
+            DeadAfterCall = List.size();
+          else
+            for (Instruction *Inst : List)
+              DeadAfterCall += GetRegUsage(Inst->getType(), VFs[j]);
+        }
+
+        unsigned RetUsage = VFs[j] == 1 ? 1 : GetRegUsage(I->getType(), VFs[j]);
+
+        // The interleave factor for the call is the number of times the
+        // call and the live values at the callsite can be cloned without
+        // causing additional spilling. For the live values, this is simply
+        // the number of registers preserved across the call divided by the
+        // number of live values. However, the calculation must also take
+        // into account the additional registers needed for the return
+        // values of the cloned calls (the return value of the last call
+        // does not need to be preserved). Similarly, some of the live
+        // values may not be used after the call (e.g. call arguments).
+        // When the additional registers for the returns equal the number
+        // of registers dead after the call, they cancel each other out.
+        unsigned PR = TTI.getNumberOfCallPreservedRegisters(VFs[j] > 1);
+        unsigned Diff = DeadAfterCall > RetUsage ? 0 : RetUsage - DeadAfterCall;
+        MaxCallIC[j] = std::min(MaxCallIC[j], (PR + Diff) / (RegUsage + Diff));
+      }
     }
 
     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                       << OpenIntervals.size() << '\n');
@@ -5431,6 +5508,16 @@
     RU.LoopInvariantRegs = Invariant;
     RU.MaxLocalUsers = MaxUsages[i];
+
+    if (handleCalls) {
+      LLVM_DEBUG(dbgs() << "LV(REG): Found max call usage: "
+                        << MaxCallUsages[i] << '\n');
+      LLVM_DEBUG(dbgs() << "LV(REG): Found max call IC: "
+                        << MaxCallIC[i] << '\n');
+      RU.MaxCallUsers = MaxCallUsages[i];
+      RU.MaxCallIC = MaxCallIC[i];
+    }
+
     RUs[i] = RU;
   }
Index: test/Transforms/LoopVectorize/X86/interleaving-veclib-call.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/interleaving-veclib-call.ll
+++ test/Transforms/LoopVectorize/X86/interleaving-veclib-call.ll
@@ -0,0 +1,101 @@
+; RUN: opt -S -mtriple=x86_64-unknown-linux -mcpu=btver2 -vector-library=SVML -loop-vectorize < %s | FileCheck %s
+
+; This test checks that when a call is vectorized with a vector library call,
+; the interleave count used is 1 (i.e. the call appears only once). Since
+; loops with reductions are treated specially by the cost model, we also
+; test this case. Finally, a test is included that checks that the
+; interleave count is not restricted when the call is not vectorized to a
+; vector library call.
+
+; CHECK-LABEL: sinf-test
+; CHECK: call <8 x float> @__svml_sinf8
+; CHECK-NOT: call <8 x float> @__svml_sinf8
+
+define void @sinf-test(float* nocapture readonly %a, float* nocapture %b, i32 %n) {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @sinf(float %0)
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare dso_local float @sinf(float) local_unnamed_addr
+
+; CHECK-LABEL: sinf-reduc-test
+; CHECK: call fast <8 x float> @__svml_sinf8
+; CHECK-NOT: call fast <8 x float> @__svml_sinf8
+
+define float @sinf-reduc-test(float* nocapture readonly %a, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %s.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  ret float %s.0.lcssa
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %s.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %1 = tail call fast float @llvm.sin.f32(float %0)
+  %add = fadd fast float %1, %s.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare float @llvm.sin.f32(float)
+
+; CHECK-LABEL: ceilf-test
+; CHECK: call <8 x float> @llvm.ceil.v8f32
+; CHECK: call <8 x float> @llvm.ceil.v8f32
+; CHECK: call <8 x float> @llvm.ceil.v8f32
+; CHECK: call <8 x float> @llvm.ceil.v8f32
+
+define void @ceilf-test(float* nocapture readonly %a, float* nocapture %b, i32 %n) {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %1 = tail call float @llvm.ceil.f32(float %0)
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %1, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare float @llvm.ceil.f32(float)
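
As a worked example of the MaxCallIC arithmetic in calculateRegisterUsage()
above, here is a minimal standalone sketch. The helper name maxCallInterleave
and all input values are hypothetical, chosen to match the Win64-without-AVX
case, where the preserved XMM8-XMM15 give PR = 8:

#include <cstdio>

// Sketch of the MaxCallIC computation from calculateRegisterUsage().
// RegUsage must be non-zero; the patch skips this computation when no
// values are live across the call.
static unsigned maxCallInterleave(unsigned PR,       // call-preserved registers
                                  unsigned RegUsage, // values live across the call
                                  unsigned RetUsage, // registers for the return value
                                  unsigned DeadAfterCall) { // values dying right after
  // Extra registers each clone needs for its return value, less the
  // registers freed by live values that die immediately after the call.
  unsigned Diff = DeadAfterCall > RetUsage ? 0 : RetUsage - DeadAfterCall;
  // With IC clones, roughly IC * (RegUsage + Diff) - Diff registers must
  // survive the final call, so IC <= (PR + Diff) / (RegUsage + Diff).
  return (PR + Diff) / (RegUsage + Diff);
}

int main() {
  // Hypothetical callsite: 8 preserved XMM registers, 2 live vector
  // values, 1 returned vector, 1 live value dead after the call:
  // Diff = 1 - 1 = 0, so the bound is (8 + 0) / (2 + 0) = 4.
  printf("max call IC: %u\n", maxCallInterleave(8, 2, 1, 1)); // prints 4
}

getInterleaveCount() then rounds this bound down to a power of 2 and clamps
it against the target's ordinary maximum interleave factor.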