diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -94,8 +94,11 @@ bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R); /// Try to vectorize a list of operands. + /// \param LimitForRegisterSize Vectorize only using the maximal allowed register + /// size. /// \returns true if a value was vectorized. - bool tryToVectorizeList(ArrayRef VL, slpvectorizer::BoUpSLP &R); + bool tryToVectorizeList(ArrayRef VL, slpvectorizer::BoUpSLP &R, + bool LimitForRegisterSize = false); /// Try to vectorize a chain that may start at the operands of \p I. bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7681,7 +7681,8 @@ return tryToVectorizeList(VL, R); } -bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R) { +bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, + bool LimitForRegisterSize) { if (VL.size() < 2) return false; @@ -7753,7 +7754,8 @@ if (!isPowerOf2_32(OpsWidth)) continue; - if ((VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2)) + if ((LimitForRegisterSize && OpsWidth < MaxVF) || + (VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2)) break; ArrayRef Ops = VL.slice(I, OpsWidth); @@ -9106,21 +9108,49 @@ // So allow tryToVectorizeList to reorder them if it is beneficial. This // is done when there are exactly two elements since tryToVectorizeList // asserts that there are only two values when AllowReorder is true. - if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) { + // The vectorization is a 3-stage attempt: + // 1. 
Try to vectorize PHIs with the same/alternate opcodes using the + // maximal register size first. + // 2. Try to vectorize remaining PHIs with the same type, if possible. + // This may give better vectorization results than if we only try to + // vectorize PHIs with the same/alternate opcodes. + // 3. Final attempt to vectorize all PHIs with the same/alternate + // ops only; this may result in some extra final vectorization. + if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, + /*LimitForRegisterSize=*/true)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; - } else if (NumElts < 4 && + } else if (NumElts * R.getVectorElementSize(*IncIt) < + R.getMaxVecRegSize() && (Candidates.empty() || Candidates.front()->getType() == (*IncIt)->getType())) { Candidates.append(IncIt, std::next(IncIt, NumElts)); } // Final attempt to vectorize phis with the same types. - if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) { - if (Candidates.size() > 1 && tryToVectorizeList(Candidates, R)) { + if (Candidates.size() > 1 && + (SameTypeIt == E || + (*SameTypeIt)->getType() != (*IncIt)->getType())) { + if (tryToVectorizeList(Candidates, R)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; + } else { + // Try to vectorize using small vectors. 
+ for (SmallVector::iterator It = Candidates.begin(), + End = Candidates.end(); + It != End;) { + SmallVector::iterator SameTypeIt = It; + while (SameTypeIt != End && AreCompatiblePHIs(*SameTypeIt, *It)) + ++SameTypeIt; + unsigned NumElts = (SameTypeIt - It); + if (NumElts > 1 && + tryToVectorizeList(makeArrayRef(It, NumElts), R)) { + HaveVectorizedPhiNodes = true; + Changed = true; + } + It = SameTypeIt; + } } Candidates.clear(); } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll @@ -8,33 +8,32 @@ ; CHECK-NEXT: [[SUB:%.*]] = fsub float 6.553500e+04, undef ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[SUB]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1 ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP0:%.*]] = phi float [ [[SUB]], [[BB1]] ], [ [[TMP9:%.*]], [[BB3:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[CONV]], [[BB1]] ], [ [[TMP10:%.*]], [[BB3]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB1]] ], [ [[TMP11:%.*]], [[BB3]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP18:%.*]], [[BB3:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]] ; CHECK: bb4: -; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double> -; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <2 x double> undef, [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x float> [[TMP2]], <2 x float> undef -; CHECK-NEXT: [[EXT3:%.*]] = fpext float [[TMP1]] to double ; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double -; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[TMP3]], [[CONV2]] 
-; CHECK-NEXT: [[CMP3:%.*]] = fcmp ogt double [[ADD1]], [[EXT3]] -; CHECK-NEXT: [[TMP7:%.*]] = fptrunc double [[ADD1]] to float -; CHECK-NEXT: [[SEL3:%.*]] = select i1 [[CMP3]], float [[TMP1]], float [[TMP7]] -; CHECK-NEXT: [[EXT4:%.*]] = fpext float [[TMP0]] to double -; CHECK-NEXT: [[SUB1:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[CMP4:%.*]] = fcmp ogt double [[SUB1]], [[EXT4]] -; CHECK-NEXT: [[TMP8:%.*]] = fptrunc double [[SUB1]] to float -; CHECK-NEXT: [[SEL4:%.*]] = select i1 [[CMP4]], float [[TMP0]], float [[TMP8]] +; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[CONV2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = fcmp ogt <4 x double> [[TMP13]], [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = fptrunc <4 x double> [[TMP15]] to <4 x float> +; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP2]], <4 x float> [[TMP16]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP9]] = phi float [ [[SEL4]], [[BB4]] ], [ [[TMP0]], [[BB2]] ] -; CHECK-NEXT: [[TMP10]] = phi float [ [[SEL3]], [[BB4]] ], [ [[TMP1]], [[BB2]] ] -; CHECK-NEXT: [[TMP11]] = phi <2 x float> [ [[TMP6]], [[BB4]] ], [ [[TMP2]], [[BB2]] ] +; 
CHECK-NEXT: [[TMP18]] = phi <4 x float> [ [[TMP17]], [[BB4]] ], [ [[TMP2]], [[BB2]] ] ; CHECK-NEXT: br label [[BB2]] ; entry: