Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -794,10 +794,20 @@
   /// Additional properties of an operand's values.
   enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };

+  /// \return True if floating-point registers overlap with integer registers.
+  /// On most targets, scalar floating-point registers do not overlap with
+  /// integer registers, but vector floating-point values do overlap because
+  /// float and integer vectors normally reside in the same vector registers.
+  /// For now we assume the target has as many floating-point registers as
+  /// integer registers, so any overlap is full rather than partial.
+  bool isFloatOverlapIntRegs(bool Vector) const;
+
   /// \return The number of scalar or vector registers that the target has.
   /// If 'Vectors' is true, it returns the number of vector registers. If it is
   /// set to false, it returns the number of scalar registers.
-  unsigned getNumberOfRegisters(bool Vector) const;
+  /// If 'IsFloatTy' is true, it returns the number of floating-point
+  /// registers, which may differ from the number of integer registers.
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy = false) const;

   /// \return The width of the largest scalar or vector register type.
   unsigned getRegisterBitWidth(bool Vector) const;
@@ -1251,7 +1261,8 @@
                             Type *Ty) = 0;
   virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                             Type *Ty) = 0;
-  virtual unsigned getNumberOfRegisters(bool Vector) = 0;
+  virtual unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy = false) = 0;
+  virtual bool isFloatOverlapIntRegs(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
@@ -1596,8 +1607,11 @@
                     Type *Ty) override {
     return Impl.getIntImmCost(IID, Idx, Imm, Ty);
   }
-  unsigned getNumberOfRegisters(bool Vector) override {
-    return Impl.getNumberOfRegisters(Vector);
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy = false) override {
+    return Impl.getNumberOfRegisters(Vector, IsFloatTy);
+  }
+  bool isFloatOverlapIntRegs(bool Vector) override {
+    return Impl.isFloatOverlapIntRegs(Vector);
   }
   unsigned getRegisterBitWidth(bool Vector) const override {
     return Impl.getRegisterBitWidth(Vector);
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -358,7 +358,11 @@
     return TTI::TCC_Free;
   }

-  unsigned getNumberOfRegisters(bool Vector) { return 8; }
+  bool isFloatOverlapIntRegs(bool Vector) {
+    return Vector;
+  }
+
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) { return 8; }

   unsigned getRegisterBitWidth(bool Vector) const { return 32; }

Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -519,7 +519,9 @@
   /// \name Vector TTI Implementations
   /// @{

-  unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; }
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) {
+    return Vector ? 0 : 1;
+  }

   unsigned getRegisterBitWidth(bool Vector) const { return 32; }

Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -480,8 +480,13 @@
   return Cost;
 }

-unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
-  return TTIImpl->getNumberOfRegisters(Vector);
+bool TargetTransformInfo::isFloatOverlapIntRegs(bool Vector) const {
+  return TTIImpl->isFloatOverlapIntRegs(Vector);
+}
+
+unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector,
+                                                   bool IsFloatTy) const {
+  return TTIImpl->getNumberOfRegisters(Vector, IsFloatTy);
 }

 unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -85,7 +85,7 @@

   bool enableInterleavedAccessVectorization() { return true; }

-  unsigned getNumberOfRegisters(bool Vector) {
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) {
     if (Vector) {
       if (ST->hasNEON())
         return 32;
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -136,7 +136,7 @@
   }

   unsigned getHardwareNumberOfRegisters(bool Vector) const;
-  unsigned getNumberOfRegisters(bool Vector) const;
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) const;
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -231,7 +231,7 @@
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
   unsigned getHardwareNumberOfRegisters(bool Vec) const;
-  unsigned getNumberOfRegisters(bool Vec) const;
+  unsigned getNumberOfRegisters(bool Vec, bool IsFloatTy) const;
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -218,7 +218,7 @@
   return 256;
 }

-unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
+unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec, bool IsFloatTy) const {
   // This is really the number of registers to fill when vectorizing /
   // interleaving loops, so we lie to avoid trying to use all registers.
   return getHardwareNumberOfRegisters(Vec) >> 3;
@@ -682,7 +682,7 @@
   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
 }

-unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
+unsigned R600TTIImpl::getNumberOfRegisters(bool Vec, bool IsFloatTy) const {
   return getHardwareNumberOfRegisters(Vec);
 }

Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -122,7 +122,7 @@
   /// \name Vector TTI Implementations
   /// @{

-  unsigned getNumberOfRegisters(bool Vector) {
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) {
     if (Vector) {
       if (ST->hasNEON())
         return 16;
Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -76,7 +76,7 @@
   /// \name Vector TTI Implementations
   /// @{

-  unsigned getNumberOfRegisters(bool vector) const;
+  unsigned getNumberOfRegisters(bool vector, bool IsFloatTy) const;
   unsigned getMaxInterleaveFactor(unsigned VF);
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -91,7 +91,7 @@

 /// --- Vector TTI begin ---

-unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const {
+unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector, bool IsFloatTy) const {
   if (Vector)
     return useHVX() ? 32 : 0;
   return 32;
Index: llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -66,7 +66,7 @@
   // vectorizers but disables heuristics based on the number of registers.
   // FIXME: Return a more reasonable number, while keeping an eye on
   // LoopVectorizer's unrolling heuristics.
-  unsigned getNumberOfRegisters(bool Vector) const { return 1; }
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) const { return 1; }

   // Only <2 x half> should be vectorized, so always return 32 for the vector
   // register size.
Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -72,7 +72,7 @@
   TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                     bool IsZeroCmp) const;
   bool enableInterleavedAccessVectorization();
-  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy);
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getCacheLineSize();
   unsigned getPrefetchDistance();
Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -594,10 +594,14 @@
   return true;
 }

-unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
+unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector, bool IsFloatTy) {
   if (Vector && !ST->hasAltivec() && !ST->hasQPX())
     return 0;
-  return ST->hasVSX() ? 64 : 32;
+  if (Vector)
+    return ST->hasVSX() ? 64 : 32;
+
+  // For scalar types, only float registers benefit from VSX.
+  return ST->hasVSX() && IsFloatTy ? 64 : 32;
 }

 unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
Index: llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -56,7 +56,7 @@
   /// \name Vector TTI Implementations
   /// @{

-  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy);
   unsigned getRegisterBitWidth(bool Vector) const;

   unsigned getCacheLineSize() { return 256; }
Index: llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -304,7 +304,7 @@
                   C2.ScaleCost, C2.SetupCost);
 }

-unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
+unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector, bool IsFloatTy) {
   if (!Vector)
     // Discount the stack pointer.  Also leave out %r0, since it can't
     // be used in an address.
Index: llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -53,7 +53,7 @@
   /// \name Vector TTI Implementations
   /// @{

-  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy);
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
Index: llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -25,8 +25,8 @@
   return TargetTransformInfo::PSK_FastHardware;
 }

-unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) {
-  unsigned Result = BaseT::getNumberOfRegisters(Vector);
+unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector, bool IsFloatTy) {
+  unsigned Result = BaseT::getNumberOfRegisters(Vector, IsFloatTy);

   // For SIMD, use at least 16 registers, as a rough guess.
   if (Vector)
Index: llvm/lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -115,7 +115,7 @@
   /// \name Vector TTI Implementations
   /// @{

-  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy);
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
   unsigned getMaxInterleaveFactor(unsigned VF);
Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -118,7 +118,7 @@
   llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
 }

-unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
+unsigned X86TTIImpl::getNumberOfRegisters(bool Vector, bool IsFloatTy) {
   if (Vector && !ST->hasSSE1())
     return 0;

Index: llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
+++ llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -40,7 +40,7 @@
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
         TLI(ST->getTargetLowering()) {}

-  unsigned getNumberOfRegisters(bool Vector) {
+  unsigned getNumberOfRegisters(bool Vector, bool IsFloatTy) {
     if (Vector) {
       return 0;
     }
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -204,12 +204,18 @@
 /// number.
 static const unsigned TinyTripCountInterleaveThreshold = 128;

-static cl::opt<unsigned> ForceTargetNumScalarRegs(
-    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
+static cl::opt<unsigned> ForceTargetNumIntScalarRegs(
+    "force-target-num-int-scalar-regs", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's number of scalar registers."));
+static cl::opt<unsigned> ForceTargetNumFloatScalarRegs(
+    "force-target-num-float-scalar-regs", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's number of scalar registers."));

-static cl::opt<unsigned> ForceTargetNumVectorRegs(
-    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
+static cl::opt<unsigned> ForceTargetNumIntVectorRegs(
+    "force-target-num-int-vector-regs", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's number of vector registers."));
+static cl::opt<unsigned> ForceTargetNumFloatVectorRegs(
+    "force-target-num-float-vector-regs", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's number of vector registers."));

 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
@@ -982,11 +988,14 @@
   /// A struct that represents some properties of the register usage
   /// of a loop.
   struct RegisterUsage {
-    /// Holds the number of loop invariant values that are used in the loop.
-    unsigned LoopInvariantRegs;
-
-    /// Holds the maximum number of concurrent live intervals in the loop.
-    unsigned MaxLocalUsers;
+    /// Holds the number of integer loop-invariant values used in the loop.
+    unsigned IntLoopInvariantRegs;
+    /// Holds the maximum number of concurrent integer live intervals.
+    unsigned IntMaxLocalUsers;
+    /// Holds the number of float loop-invariant values used in the loop.
+    unsigned FloatLoopInvariantRegs;
+    /// Holds the maximum number of concurrent float live intervals.
+    unsigned FloatMaxLocalUsers;
   };

   /// \return Returns information about the register usages of the loop for the
@@ -4928,11 +4937,28 @@

   // Select the largest VF which doesn't require more registers than existing
   // ones.
-  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
+  unsigned TargetIntNumRegisters = TTI.getNumberOfRegisters(true);
+  unsigned TargetFloatNumRegisters = TTI.getNumberOfRegisters(true, true);
+  bool Overlap = TTI.isFloatOverlapIntRegs(true);
+
   for (int i = RUs.size() - 1; i >= 0; --i) {
-    if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
-      MaxVF = VFs[i];
-      break;
+    if (Overlap) {
+      // We assume the target has the same number of float and integer
+      // registers when Overlap is true, so the overlap is full and we can
+      // use either count as the target register number.
+      unsigned TargetNumRegisters = TargetFloatNumRegisters;
+
+      if (RUs[i].IntMaxLocalUsers + RUs[i].FloatMaxLocalUsers <=
+          TargetNumRegisters) {
+        MaxVF = VFs[i];
+        break;
+      }
+    } else {
+      if (RUs[i].IntMaxLocalUsers <= TargetIntNumRegisters &&
+          RUs[i].FloatMaxLocalUsers <= TargetFloatNumRegisters) {
+        MaxVF = VFs[i];
+        break;
+      }
     }
   }
   if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
@@ -5081,22 +5107,29 @@
   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
     return 1;

-  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
-  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
-                    << " registers\n");
+  unsigned TargetIntNumRegisters = TTI.getNumberOfRegisters(VF > 1);
+  unsigned TargetFloatNumRegisters = TTI.getNumberOfRegisters(VF > 1, true);
+  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetIntNumRegisters
+                    << " integer registers, " << TargetFloatNumRegisters
+                    << " float registers." << '\n');

   if (VF == 1) {
-    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
-      TargetNumRegisters = ForceTargetNumScalarRegs;
+    if (ForceTargetNumIntScalarRegs.getNumOccurrences() > 0)
+      TargetIntNumRegisters = ForceTargetNumIntScalarRegs;
+    if (ForceTargetNumFloatScalarRegs.getNumOccurrences() > 0)
+      TargetFloatNumRegisters = ForceTargetNumFloatScalarRegs;
   } else {
-    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
-      TargetNumRegisters = ForceTargetNumVectorRegs;
+    if (ForceTargetNumIntVectorRegs.getNumOccurrences() > 0)
+      TargetIntNumRegisters = ForceTargetNumIntVectorRegs;
+    if (ForceTargetNumFloatVectorRegs.getNumOccurrences() > 0)
+      TargetFloatNumRegisters = ForceTargetNumFloatVectorRegs;
   }

   RegisterUsage R = calculateRegisterUsage({VF})[0];
   // We divide by these constants so assume that we have at least one
   // instruction that uses at least one register.
-  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
+  R.IntMaxLocalUsers = std::max(R.IntMaxLocalUsers, 1U);
+  R.FloatMaxLocalUsers = std::max(R.FloatMaxLocalUsers, 1U);

   // We calculate the interleave count using the following formula.
   // Subtract the number of loop invariants from the number of available
@@ -5109,13 +5142,43 @@
   // We also want power of two interleave counts to ensure that the induction
   // variable of the vector loop wraps to zero, when tail is folded by masking;
   // this currently happens when OptForSize, in which case IC is set to 1 above.
-  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
-                              R.MaxLocalUsers);
+  unsigned IC = 1;
+
+  bool Overlap = TTI.isFloatOverlapIntRegs(VF > 1);
+
+  if (!Overlap) {
+    unsigned IntIC = PowerOf2Floor(
+        (TargetIntNumRegisters - R.IntLoopInvariantRegs) / R.IntMaxLocalUsers);
+    unsigned FloatIC =
+        PowerOf2Floor((TargetFloatNumRegisters - R.FloatLoopInvariantRegs) /
+                      R.FloatMaxLocalUsers);
+    // Don't count the induction variable as interleaved.
+    if (EnableIndVarRegisterHeur) {
+      IntIC =
+          PowerOf2Floor((TargetIntNumRegisters - R.IntLoopInvariantRegs - 1) /
+                        std::max(1U, (R.IntMaxLocalUsers - 1)));
+      FloatIC = PowerOf2Floor(
+          (TargetFloatNumRegisters - R.FloatLoopInvariantRegs - 1) /
+          std::max(1U, (R.FloatMaxLocalUsers - 1)));
+    }

-  // Don't count the induction variable as interleaved.
-  if (EnableIndVarRegisterHeur)
-    IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
-                       std::max(1U, (R.MaxLocalUsers - 1)));
+    IC = std::min(IntIC, FloatIC);
+
+  } else {
+    // We assume the target has the same number of float and integer
+    // registers when Overlap is true, so the overlap is full and we can
+    // use either count as the target register number.
+    unsigned TargetNumRegisters = TargetFloatNumRegisters;
+    unsigned LoopInvariantRegs =
+        R.IntLoopInvariantRegs + R.FloatLoopInvariantRegs;
+    unsigned MaxLocalUsers = R.IntMaxLocalUsers + R.FloatMaxLocalUsers;
+
+    IC =
+        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
+    if (EnableIndVarRegisterHeur)
+      IC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
+                         std::max(1U, (MaxLocalUsers - 1)));
+  }

   // Clamp the interleave ranges to reasonable counts.
   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
@@ -5297,7 +5360,8 @@
   const DataLayout &DL = TheFunction->getParent()->getDataLayout();

   SmallVector<RegisterUsage, 8> RUs(VFs.size());
-  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
+  SmallVector<unsigned, 8> IntMaxUsages(VFs.size(), 0);
+  SmallVector<unsigned, 8> FloatMaxUsages(VFs.size(), 0);

   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

@@ -5327,21 +5391,33 @@

     // For each VF find the maximum usage of registers.
     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
-      if (VFs[j] == 1) {
-        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
-        continue;
-      }
-      collectUniformsAndScalars(VFs[j]);
       // Count the number of live intervals.
-      unsigned RegUsage = 0;
-      for (auto Inst : OpenIntervals) {
-        // Skip ignored values for VF > 1.
-        if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
-            isScalarAfterVectorization(Inst, VFs[j]))
-          continue;
-        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+      unsigned IntRegUsage = 0;
+      unsigned FloatRegUsage = 0;
+
+      if (VFs[j] == 1) {
+        for (auto Inst : OpenIntervals) {
+          if (Inst->getType()->isFloatTy())
+            FloatRegUsage += 1;
+          else
+            IntRegUsage += 1;
+        }
+      } else {
+        collectUniformsAndScalars(VFs[j]);
+        for (auto Inst : OpenIntervals) {
+          // Skip ignored values for VF > 1.
+          if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
+              isScalarAfterVectorization(Inst, VFs[j]))
+            continue;
+          if (Inst->getType()->isFloatTy())
+            FloatRegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+          else
+            IntRegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+        }
       }
-      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
+
+      IntMaxUsages[j] = std::max(IntMaxUsages[j], IntRegUsage);
+      FloatMaxUsages[j] = std::max(FloatMaxUsages[j], FloatRegUsage);
     }

     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
@@ -5352,21 +5428,27 @@

   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
-    unsigned Invariant = 0;
-    if (VFs[i] == 1)
-      Invariant = LoopInvariants.size();
-    else {
-      for (auto Inst : LoopInvariants)
-        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
+    unsigned IntInvariant = 0;
+    unsigned FloatInvariant = 0;
+
+    for (auto Inst : LoopInvariants) {
+      unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
+      if (Inst->getType()->isFloatTy())
+        FloatInvariant += Usage;
+      else
+        IntInvariant += Usage;
     }

     LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
-    LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
-    LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
-                      << '\n');
-
-    RU.LoopInvariantRegs = Invariant;
-    RU.MaxLocalUsers = MaxUsages[i];
+    LLVM_DEBUG(dbgs() << "LV(REG): Found max int usage: " << IntMaxUsages[i]
+                      << ", max float usage: " << FloatMaxUsages[i] << '\n');
+    LLVM_DEBUG(dbgs() << "LV(REG): Found int invariant usage: " << IntInvariant
+                      << ", float invariant usage: " << FloatInvariant << '\n');
+
+    RU.IntLoopInvariantRegs = IntInvariant;
+    RU.FloatLoopInvariantRegs = FloatInvariant;
+    RU.IntMaxLocalUsers = IntMaxUsages[i];
+    RU.FloatMaxLocalUsers = FloatMaxUsages[i];
     RUs[i] = RU;
   }
Index: llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
@@ -0,0 +1,160 @@
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=powerpc64-unknown-linux -S -mcpu=pwr8 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR8
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR9
+
+@a = global [1024 x i8] zeroinitializer, align 16
+@b = global [1024 x i8] zeroinitializer, align 16
+
+define i32 @foo() {
+;
+; CHECK-LABEL: foo
+
+; CHECK: LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max int usage: 7, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+; CHECK: LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max int usage: 13, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+
+; CHECK-PWR8: LV(REG): VF = 16
+; CHECK-PWR8-NEXT: LV(REG): Found max int usage: 13, max float usage: 0
+; CHECK-PWR8-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+; CHECK-PWR8: Setting best plan to VF=16, UF=4
+
+; CHECK-PWR9: LV(REG): VF = 8
+; CHECK-PWR9-NEXT: LV(REG): Found max int usage: 7, max float usage: 0
+; CHECK-PWR9-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+; CHECK-PWR9: Setting best plan to VF=8, UF=8
+
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %ispos = icmp sgt i32 %sub, -1
+  %neg = sub nsw i32 0, %sub
+  %2 = select i1 %ispos, i32 %sub, i32 %neg
+  %add = add nsw i32 %2, %s.015
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i32 @goo() {
+; For indvars.iv used in a computing chain only feeding into getelementptr or cmp,
+; it will not have a vector version, so the vector register usage will not exceed
+; the available vector register number.
+; CHECK-LABEL: goo
+; CHECK: LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max int usage: 7, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+; CHECK: LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max int usage: 13, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+; CHECK: LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max int usage: 13, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+
+; CHECK: Setting best plan to VF=16, UF=4
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %tmp1 = add nsw i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %tmp1
+  %tmp = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %tmp to i32
+  %tmp2 = add nsw i64 %indvars.iv, 2
+  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %tmp2
+  %tmp3 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %tmp3 to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %ispos = icmp sgt i32 %sub, -1
+  %neg = sub nsw i32 0, %sub
+  %tmp4 = select i1 %ispos, i32 %sub, i32 %neg
+  %add = add nsw i32 %tmp4, %s.015
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i64 @bar(i64* nocapture %a) {
+; CHECK-LABEL: bar
+; CHECK: LV(REG): VF = 2
+; CHECK-NEXT: LV(REG): Found max int usage: 3, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+
+; CHECK: Setting best plan to VF=2, UF=12
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  %add2.lcssa = phi i64 [ %add2, %for.body ]
+  ret i64 %add2.lcssa
+
+for.body:
+  %i.012 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %s.011 = phi i64 [ 0, %entry ], [ %add2, %for.body ]
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.012
+  %0 = load i64, i64* %arrayidx, align 8
+  %add = add nsw i64 %0, %i.012
+  store i64 %add, i64* %arrayidx, align 8
+  %add2 = add nsw i64 %add, %s.011
+  %inc = add nuw nsw i64 %i.012, 1
+  %exitcond = icmp eq i64 %inc, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+@d = external global [0 x i64], align 8
+@e = external global [0 x i32], align 4
+@c = external global [0 x i32], align 4
+
+define void @hoo(i32 %n) {
+; CHECK-LABEL: hoo
+; CHECK: LV(REG): VF = 4
+; CHECK-NEXT: LV(REG): Found max int usage: 2, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+; CHECK: LV(REG): VF = 1
+; CHECK-NEXT: LV(REG): Found max int usage: 2, max float usage: 0
+; CHECK-NEXT: LV(REG): Found int invariant usage: 0, float invariant usage: 0
+
+; CHECK: Setting best plan to VF=1, UF=12
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 %indvars.iv
+  %tmp = load i64, i64* %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 %tmp
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %arrayidx3 = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 %indvars.iv
+  store i32 %tmp1, i32* %arrayidx3, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
Index: llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
+++ llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
@@ -22,7 +22,7 @@
 target triple = "x86_64-unknown-linux-gnu"

 ; CHECK: LV: Checking a loop in "test_g"
-; CHECK: LV(REG): Found max usage: 2
+; CHECK: LV(REG): Found max int usage: 2

 define i32 @test_g(i32* nocapture readonly %a, i32 %n) local_unnamed_addr !dbg !6 {
 entry:
@@ -60,7 +60,7 @@
 }

 ; CHECK: LV: Checking a loop in "test"
-; CHECK: LV(REG): Found max usage: 2
+; CHECK: LV(REG): Found max int usage: 2

 define i32 @test(i32* nocapture readonly %a, i32 %n) local_unnamed_addr {
 entry:
Index: llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
+++ llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
@@ -11,9 +11,9 @@
 ;
 ; CHECK-LABEL: foo
 ; CHECK: LV(REG): VF = 8
-; CHECK-NEXT: LV(REG): Found max usage: 7
+; CHECK-NEXT: LV(REG): Found max int usage: 7
 ; CHECK: LV(REG): VF = 16
-; CHECK-NEXT: LV(REG): Found max usage: 13
+; CHECK-NEXT: LV(REG): Found max int usage: 13

 entry:
   br label %for.body
@@ -47,9 +47,9 @@
 ; available vector register number.
 ; CHECK-LABEL: goo
 ; CHECK: LV(REG): VF = 8
-; CHECK-NEXT: LV(REG): Found max usage: 7
+; CHECK-NEXT: LV(REG): Found max int usage: 7
 ; CHECK: LV(REG): VF = 16
-; CHECK-NEXT: LV(REG): Found max usage: 13
+; CHECK-NEXT: LV(REG): Found max int usage: 13

 entry:
   br label %for.body
@@ -81,7 +81,7 @@
 define i64 @bar(i64* nocapture %a) {
 ; CHECK-LABEL: bar
 ; CHECK: LV(REG): VF = 2
-; CHECK: LV(REG): Found max usage: 3
+; CHECK: LV(REG): Found max int usage: 3
 ;
 entry:
   br label %for.body
@@ -110,10 +110,10 @@
 define void @hoo(i32 %n) {
 ; For c[i] = e[d[i]] in the loop, e[d[i]] is not consecutive but its index %tmp can
 ; be gathered into a vector. For VF == 16, the vector version of %tmp will be <16 x i64>
-; so the max usage of AVX512 vector register will be 2.
+; so the max int usage of AVX512 vector register will be 2.
 ; AVX512F-LABEL: bar
 ; AVX512F: LV(REG): VF = 16
-; AVX512F: LV(REG): Found max usage: 2
+; AVX512F: LV(REG): Found max int usage: 2
 ;
 entry:
   br label %for.body
Index: llvm/test/Transforms/LoopVectorize/unroll_novec.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/unroll_novec.ll
+++ llvm/test/Transforms/LoopVectorize/unroll_novec.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-target-num-scalar-regs=16 -force-target-max-scalar-interleave=8 -force-target-instruction-cost=1 -small-loop-cost=40 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-target-num-int-scalar-regs=16 -force-target-max-scalar-interleave=8 -force-target-instruction-cost=1 -small-loop-cost=40 -dce -instcombine -S | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
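Note for reviewers: below is a minimal, self-contained sketch (not part of the patch) of the interleave-count heuristic that the LoopVectorize.cpp changes implement. Every name in it (`pickIC`, `powerOf2Floor`, the parameters) is illustrative only; the patch itself uses `TTI.getNumberOfRegisters`, `TTI.isFloatOverlapIntRegs`, and `llvm::PowerOf2Floor`, and it additionally subtracts one live value for the induction variable when `EnableIndVarRegisterHeur` is set, which this sketch omits.

```cpp
#include <algorithm>

// Round down to the nearest power of two (stand-in for llvm::PowerOf2Floor).
static unsigned powerOf2Floor(unsigned V) {
  if (V == 0)
    return 0;
  unsigned P = 1;
  while (P <= V / 2)
    P *= 2;
  return P;
}

// Illustrative only: derive an interleave count from separate integer and
// float register pressure. If float registers fully overlap integer
// registers (typical for a shared vector register file), the two pressures
// share one budget and are summed; otherwise each register class is bounded
// independently and the smaller interleave count wins.
static unsigned pickIC(unsigned IntRegs, unsigned FloatRegs, bool Overlap,
                       unsigned IntInvariant, unsigned IntMaxUsers,
                       unsigned FloatInvariant, unsigned FloatMaxUsers) {
  // Assume at least one register is used in each class, as the patch does.
  IntMaxUsers = std::max(IntMaxUsers, 1u);
  FloatMaxUsers = std::max(FloatMaxUsers, 1u);

  if (Overlap) {
    // Fully overlapping register file: either count serves as the budget.
    unsigned Budget = FloatRegs;
    unsigned Invariant = IntInvariant + FloatInvariant;
    unsigned MaxUsers = IntMaxUsers + FloatMaxUsers;
    return powerOf2Floor((Budget - Invariant) / MaxUsers);
  }

  unsigned IntIC = powerOf2Floor((IntRegs - IntInvariant) / IntMaxUsers);
  unsigned FloatIC = powerOf2Floor((FloatRegs - FloatInvariant) / FloatMaxUsers);
  return std::min(IntIC, FloatIC);
}
```

As a worked example under those assumptions: a loop with 3 integer and 2 float live values, no invariants, and 32 overlapping vector registers gets `powerOf2Floor(32 / 5) = 4`, while the same loop on a target with separate 32-entry integer and float files gets `min(powerOf2Floor(32 / 3), powerOf2Floor(32 / 2)) = 8`.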