Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3499,7 +3499,7 @@
   /// \returns true if a value was vectorized.
   bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                           ArrayRef<Value *> BuildVector = None,
-                          bool allowReorder = false);
+                          bool allowReorder = false, unsigned VecRegSize = 128);
 
   /// \brief Try to vectorize a chain that may start at the operands of \p V.
   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
@@ -3707,12 +3707,20 @@
   if (!A || !B)
     return false;
   Value *VL[] = { A, B };
-  return tryToVectorizeList(VL, R, None, true);
+  bool SuccessToVectorizeList = false;
+  for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize;
+       VecRegSize /= 2) {
+    if (tryToVectorizeList(VL, R, None, true, VecRegSize)) {
+      SuccessToVectorizeList = true;
+      break;
+    }
+  }
+  return SuccessToVectorizeList;
 }
 
 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                        ArrayRef<Value *> BuildVector,
-                                       bool allowReorder) {
+                                       bool allowReorder, unsigned VecRegSize) {
   if (VL.size() < 2)
     return false;
 
@@ -3724,11 +3732,8 @@
     return false;
 
   unsigned Opcode0 = I0->getOpcode();
-
-  // FIXME: Register size should be a parameter to this function, so we can
-  // try different vectorization factors.
   unsigned Sz = R.getVectorElementSize(I0);
-  unsigned VF = MinVecRegSize / Sz;
+  unsigned VF = VecRegSize / Sz;
 
   for (Value *V : VL) {
     Type *Ty = V->getType();
@@ -3936,7 +3941,8 @@
         MinVecRegSize(MinVecRegSize) {}
 
   /// \brief Try to find a reduction tree.
-  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
+  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
+                                 unsigned VecRegSize) {
     assert((!Phi ||
             std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
            "Thi phi needs to use the binary operator");
@@ -3964,9 +3970,7 @@
     const DataLayout &DL = B->getModule()->getDataLayout();
     ReductionOpcode = B->getOpcode();
     ReducedValueOpcode = 0;
-    // FIXME: Register size should be a parameter to this function, so we can
-    // try different vectorization factors.
-    ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
+    ReduxWidth = VecRegSize / DL.getTypeSizeInBits(Ty);
     ReductionRoot = B;
     ReductionPHI = Phi;
 
@@ -4267,7 +4271,7 @@
     return false;
 
   HorizontalReduction HorRdx(MinRegSize);
-  if (!HorRdx.matchAssociativeReduction(P, BI))
+  if (!HorRdx.matchAssociativeReduction(P, BI, MinRegSize))
     return false;
 
   // If there is a sufficient number of reduction values, reduce
@@ -4318,7 +4322,15 @@
       // Try to vectorize them.
      unsigned NumElts = (SameTypeIt - IncIt);
      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
+      bool SuccessToVectorizeList = false;
+      for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+        if (tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, None, false,
+                               Size)) {
+          SuccessToVectorizeList = true;
+          break;
+        }
+      }
+      if (NumElts > 1 && SuccessToVectorizeList) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
@@ -4354,7 +4366,15 @@
         continue;
 
       // Try to match and vectorize a horizontal reduction.
-      if (canMatchHorizontalReduction(P, BI, R, TTI, MinVecRegSize)) {
+      bool SuccessToMatchHorizontalReduction = false;
+      for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize;
+           VecRegSize /= 2) {
+        if (canMatchHorizontalReduction(P, BI, R, TTI, VecRegSize)) {
+          SuccessToMatchHorizontalReduction = true;
+          break;
+        }
+      }
+      if (SuccessToMatchHorizontalReduction) {
         Changed = true;
         it = BB->begin();
         e = BB->end();
@@ -4381,9 +4401,16 @@
     if (StoreInst *SI = dyn_cast<StoreInst>(it))
       if (BinaryOperator *BinOp =
              dyn_cast<BinaryOperator>(SI->getValueOperand())) {
-        if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
-                                        MinVecRegSize) ||
-            tryToVectorize(BinOp, R)) {
+        bool SuccessToMatchHorizontalReduction = false;
+        for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize;
+             VecRegSize /= 2) {
+          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
+                                          VecRegSize)) {
+            SuccessToMatchHorizontalReduction = true;
+            break;
+          }
+        }
+        if (SuccessToMatchHorizontalReduction || tryToVectorize(BinOp, R)) {
           Changed = true;
           it = BB->begin();
           e = BB->end();
@@ -4442,12 +4469,16 @@
       // Vectorize starting with the build vector operands ignoring the
      // BuildVector instructions for the purpose of scheduling and user
      // extraction.
-      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
-        Changed = true;
-        it = BB->begin();
-        e = BB->end();
+      for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize;
+           VecRegSize /= 2) {
+        if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false,
+                               VecRegSize)) {
+          Changed = true;
+          it = BB->begin();
+          e = BB->end();
+          break;
+        }
       }
 
-      continue;
     }
   }
@@ -4532,7 +4563,13 @@
       // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
-      Changed |= tryToVectorizeList(Bundle, R);
+      for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize;
+           VecRegSize /= 2) {
+        if (tryToVectorizeList(Bundle, R, None, false, VecRegSize)) {
+          Changed = true;
+          break;
+        }
+      }
     }
   }
   return Changed;
Index: test/Transforms/SLPVectorizer/AArch64/slp-vectorized-from-max-to-min.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/AArch64/slp-vectorized-from-max-to-min.ll
@@ -0,0 +1,37 @@
+;RUN: opt -S -slp-vectorizer -slp-max-reg-size=128 -slp-min-reg-size=64 -slp-threshold=-13 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK: @foo
+; CHECK: add nsw <2 x i64>
+; CHECK: add nsw <2 x i64>
+
+define i64 @foo(i64* nocapture readonly %a) #0 {
+entry:
+  %idx1 = getelementptr inbounds i64, i64* %a, i64 1
+  %idx2 = getelementptr inbounds i64, i64* %a, i64 2
+  %idx3 = getelementptr inbounds i64, i64* %a, i64 3
+  %idx4 = getelementptr inbounds i64, i64* %a, i64 4
+  %idx5 = getelementptr inbounds i64, i64* %a, i64 5
+  %idx6 = getelementptr inbounds i64, i64* %a, i64 6
+  %idx7 = getelementptr inbounds i64, i64* %a, i64 7
+  %0 = load i64, i64* %a, align 4
+  %1 = load i64, i64* %idx1, align 4
+  %2 = load i64, i64* %idx2, align 4
+  %3 = load i64, i64* %idx3, align 4
+  %4 = load i64, i64* %idx4, align 4
+  %5 = load i64, i64* %idx5, align 4
+  %6 = load i64, i64* %idx6, align 4
+  %7 = load i64, i64* %idx7, align 4
+  %add = add nsw i64 %1, %0
+  %add1 = add nsw i64 %3, %2
+  %add2 = add nsw i64 %5, %4
+  %add3 = add nsw i64 %7, %6
+  %add8 = add nsw i64 %add1, %add
+  %add9 = add nsw i64 %add3, %add2
+  %add12 = add nsw i64 %add9, %add8
+  ret i64 %add12
+}
+
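
Note on the repeated pattern: every call site above expands the same descending
register-size retry loop inline. Below is a minimal sketch, not part of the
patch, of how that loop could be written once. The helper name
tryForEachVecRegSize is hypothetical; MaxVecRegSize, MinVecRegSize and the
tryToVectorizeList signature are the ones used by the patch, and
llvm::function_ref comes from llvm/ADT/STLExtras.h.

// Illustrative only. Walk candidate register sizes from the widest to the
// narrowest, halving each step, and stop at the first size for which the
// callback reports a successful vectorization.
// Requires: #include "llvm/ADT/STLExtras.h" for llvm::function_ref.
static bool tryForEachVecRegSize(unsigned MaxVecRegSize, unsigned MinVecRegSize,
                                 llvm::function_ref<bool(unsigned)> Try) {
  for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize;
       VecRegSize /= 2)
    if (Try(VecRegSize))
      return true;
  return false;
}

// Example use, equivalent to the expanded loop around the Bundle call site in
// the last hunk:
//   Changed |= tryForEachVecRegSize(MaxVecRegSize, MinVecRegSize,
//       [&](unsigned VecRegSize) {
//         return tryToVectorizeList(Bundle, R, None, false, VecRegSize);
//       });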