diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -94,8 +94,11 @@
   bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
 
   /// Try to vectorize a list of operands.
+  /// \param LimitForRegisterSize Vectorize only using maximal allowed register
+  /// size.
   /// \returns true if a value was vectorized.
-  bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R);
+  bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
+                          bool LimitForRegisterSize = false);
 
   /// Try to vectorize a chain that may start at the operands of \p I.
   bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7665,7 +7665,8 @@
   return tryToVectorizeList(VL, R);
 }
 
-bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
+bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+                                           bool LimitForRegisterSize) {
   if (VL.size() < 2)
     return false;
 
@@ -7737,7 +7738,8 @@
       if (!isPowerOf2_32(OpsWidth))
         continue;
 
-      if ((VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))
+      if ((LimitForRegisterSize && OpsWidth < MaxVF) ||
+          (VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))
         break;
 
       ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
@@ -9087,21 +9089,49 @@
       // So allow tryToVectorizeList to reorder them if it is beneficial. This
       // is done when there are exactly two elements since tryToVectorizeList
       // asserts that there are only two values when AllowReorder is true.
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
+      // The vectorization is a 3-state attempt:
+      // 1. Try to vectorize PHIs with the same/alternate opcodes with the size
+      // of maximal register at first.
+      // 2. Try to vectorize remaining PHIs with the same type, if possible.
+      // This may result in the better vectorization results rather than if we
+      // try just to vectorize PHIs with the same/alternate opcodes.
+      // 3. Final attempt to try to vectorize all PHIs with the same/alternate
+      // ops only, this may result in some extra final vectorization.
+      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
+                                            /*LimitForRegisterSize=*/true)) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
-      } else if (NumElts < 4 &&
+      } else if (NumElts * R.getVectorElementSize(*IncIt) <
+                     R.getMaxVecRegSize() &&
                  (Candidates.empty() ||
                   Candidates.front()->getType() == (*IncIt)->getType())) {
         Candidates.append(IncIt, std::next(IncIt, NumElts));
       }
       // Final attempt to vectorize phis with the same types.
-      if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) {
-        if (Candidates.size() > 1 && tryToVectorizeList(Candidates, R)) {
+      if (Candidates.size() > 1 &&
+          (SameTypeIt == E ||
+           (*SameTypeIt)->getType() != (*IncIt)->getType())) {
+        if (tryToVectorizeList(Candidates, R)) {
           // Success start over because instructions might have been changed.
           HaveVectorizedPhiNodes = true;
           Changed = true;
+        } else {
+          // Try to vectorize using small vectors.
+          for (SmallVector<Value *, 4>::iterator It = Candidates.begin(),
+                                                 End = Candidates.end();
+               It != End;) {
+            SmallVector<Value *, 4>::iterator SameTypeIt = It;
+            while (SameTypeIt != End && AreCompatiblePHIs(*SameTypeIt, *It))
+              ++SameTypeIt;
+            unsigned NumElts = (SameTypeIt - It);
+            if (NumElts > 1 &&
+                tryToVectorizeList(makeArrayRef(It, NumElts), R)) {
+              HaveVectorizedPhiNodes = true;
+              Changed = true;
+            }
+            It = SameTypeIt;
+          }
         }
         Candidates.clear();
       }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
@@ -8,33 +8,32 @@
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub float 6.553500e+04, undef
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> poison, float [[SUB]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1
 ; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ [[SUB]], [[BB1]] ], [ [[TMP9:%.*]], [[BB3:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[CONV]], [[BB1]] ], [ [[TMP10:%.*]], [[BB3]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB1]] ], [ [[TMP11:%.*]], [[BB3]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP18:%.*]], [[BB3:%.*]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load double, double* undef, align 8
 ; CHECK-NEXT:    br i1 undef, label [[BB3]], label [[BB4:%.*]]
 ; CHECK:       bb4:
-; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
-; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <2 x double> undef, [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x float> [[TMP2]], <2 x float> undef
-; CHECK-NEXT:    [[EXT3:%.*]] = fpext float [[TMP1]] to double
 ; CHECK-NEXT:    [[CONV2:%.*]] = uitofp i16 undef to double
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd double [[TMP3]], [[CONV2]]
-; CHECK-NEXT:    [[CMP3:%.*]] = fcmp ogt double [[ADD1]], [[EXT3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fptrunc double [[ADD1]] to float
-; CHECK-NEXT:    [[SEL3:%.*]] = select i1 [[CMP3]], float [[TMP1]], float [[TMP7]]
-; CHECK-NEXT:    [[EXT4:%.*]] = fpext float [[TMP0]] to double
-; CHECK-NEXT:    [[SUB1:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[CMP4:%.*]] = fcmp ogt double [[SUB1]], [[EXT4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fptrunc double [[SUB1]] to float
-; CHECK-NEXT:    [[SEL4:%.*]] = select i1 [[CMP4]], float [[TMP0]], float [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[CONV2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = fcmp ogt <4 x double> [[TMP13]], [[TMP4]]
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP16:%.*]] = fptrunc <4 x double> [[TMP15]] to <4 x float>
+; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP2]], <4 x float> [[TMP16]]
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP9]] = phi float [ [[SEL4]], [[BB4]] ], [ [[TMP0]], [[BB2]] ]
-; CHECK-NEXT:    [[TMP10]] = phi float [ [[SEL3]], [[BB4]] ], [ [[TMP1]], [[BB2]] ]
-; CHECK-NEXT:    [[TMP11]] = phi <2 x float> [ [[TMP6]], [[BB4]] ], [ [[TMP2]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP18]] = phi <4 x float> [ [[TMP17]], [[BB4]] ], [ [[TMP2]], [[BB2]] ]
 ; CHECK-NEXT:    br label [[BB2]]
 ;
 entry: