Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3498,6 +3498,8 @@ /// scheduling and that don't need extracting. /// \returns true if a value was vectorized. bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, + unsigned VecRegSize = 128, + bool vectorizeStoreChain = false, ArrayRef<Value *> BuildVector = None, bool allowReorder = false); @@ -3515,11 +3517,7 @@ /// a vectorization chain. bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); - bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold, - BoUpSLP &R, unsigned VecRegSize); - - bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, - BoUpSLP &R); + bool vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP &R); /// The store instructions in a basic block organized by base pointer. StoreListMap Stores; @@ -3548,56 +3546,7 @@ return !std::equal(VL.begin(), VL.end(), VH.begin()); } -bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, - int CostThreshold, BoUpSLP &R, - unsigned VecRegSize) { - unsigned ChainLen = Chain.size(); - DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen - << "\n"); - unsigned Sz = R.getVectorElementSize(Chain[0]); - unsigned VF = VecRegSize / Sz; - - if (!isPowerOf2_32(Sz) || VF < 2) - return false; - - // Keep track of values that were deleted by vectorizing in the loop below. - SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end()); - - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i < e; ++i) { - if (i + VF > e) - break; - - // Check that a previous iteration of this loop did not delete the Value. 
- if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) - continue; - - DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - ArrayRef<Value *> Operands = Chain.slice(i, VF); - - R.buildTree(Operands); - R.computeMinimumValueSizes(); - - int Cost = R.getTreeCost(); - - DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - R.vectorizeTree(); - - // Move to the next bundle. - i += VF - 1; - Changed = true; - } - } - - return Changed; -} - -bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, - int costThreshold, BoUpSLP &R) { +bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP &R) { SetVector<StoreInst *> Heads, Tails; SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; @@ -3652,8 +3601,9 @@ // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? - for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) { - if (vectorizeStoreChain(Operands, costThreshold, R, Size)) { + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (tryToVectorizeList(Operands, R, VecRegSize, true)) { // Mark the vectorized stores so that we don't vectorize them again. 
VectorizedStores.insert(Operands.begin(), Operands.end()); Changed = true; @@ -3707,10 +3657,17 @@ if (!A || !B) return false; Value *VL[] = { A, B }; - return tryToVectorizeList(VL, R, None, true); + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (tryToVectorizeList(VL, R, VecRegSize, false, None, true)) + return true; + } + return false; } bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, + unsigned VecRegSize, + bool vectorizeStoreChain, ArrayRef<Value *> BuildVector, bool allowReorder) { if (VL.size() < 2) @@ -3724,19 +3681,18 @@ return false; unsigned Opcode0 = I0->getOpcode(); - - // FIXME: Register size should be a parameter to this function, so we can - // try different vectorization factors. unsigned Sz = R.getVectorElementSize(I0); - unsigned VF = MinVecRegSize / Sz; + unsigned VF = VecRegSize / Sz; - for (Value *V : VL) { - Type *Ty = V->getType(); - if (!isValidElementType(Ty)) - return false; - Instruction *Inst = dyn_cast<Instruction>(V); - if (!Inst || Inst->getOpcode() != Opcode0) - return false; + if (vectorizeStoreChain == false) { + for (Value *V : VL) { + Type *Ty = V->getType(); + if (!isValidElementType(Ty)) + return false; + Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst || Inst->getOpcode() != Opcode0) + return false; + } } bool Changed = false; @@ -3745,12 +3701,14 @@ SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end()); for (unsigned i = 0, e = VL.size(); i < e; ++i) { - unsigned OpsWidth = 0; + unsigned OpsWidth = VF; - if (i + VF > e) - OpsWidth = e - i; - else - OpsWidth = VF; + if (i + VF > e) { + if (vectorizeStoreChain) + break; + else + OpsWidth = e - i; + } if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) break; @@ -3936,7 +3894,8 @@ MinVecRegSize(MinVecRegSize) {} /// \brief Try to find a reduction tree. 
- bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { + bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B, + unsigned VecRegSize) { assert((!Phi || std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && "Thi phi needs to use the binary operator"); @@ -3964,9 +3923,7 @@ const DataLayout &DL = B->getModule()->getDataLayout(); ReductionOpcode = B->getOpcode(); ReducedValueOpcode = 0; - // FIXME: Register size should be a parameter to this function, so we can - // try different vectorization factors. - ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty); + ReduxWidth = VecRegSize / DL.getTypeSizeInBits(Ty); ReductionRoot = B; ReductionPHI = Phi; @@ -4267,7 +4224,7 @@ return false; HorizontalReduction HorRdx(MinRegSize); - if (!HorRdx.matchAssociativeReduction(P, BI)) + if (!HorRdx.matchAssociativeReduction(P, BI, MinRegSize)) return false; // If there is a sufficient number of reduction values, reduce @@ -4318,13 +4275,18 @@ // Try to vectorize them. unsigned NumElts = (SameTypeIt - IncIt); DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n"); - if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) { - // Success start over because instructions might have been changed. - HaveVectorizedPhiNodes = true; - Changed = true; - break; + if (NumElts > 1) { + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, VecRegSize)) { + HaveVectorizedPhiNodes = true; + Changed = true; + break; + } + } + if (Changed) + break; } - // Start over at the next instruction of a different type (or the end). IncIt = SameTypeIt; } @@ -4354,7 +4316,15 @@ continue; // Try to match and vectorize a horizontal reduction. 
- if (canMatchHorizontalReduction(P, BI, R, TTI, MinVecRegSize)) { + bool SuccessToMatchHorizontalReduction = false; + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (canMatchHorizontalReduction(P, BI, R, TTI, VecRegSize)) { + SuccessToMatchHorizontalReduction = true; + break; + } + } + if (SuccessToMatchHorizontalReduction) { Changed = true; it = BB->begin(); e = BB->end(); @@ -4381,9 +4351,16 @@ if (StoreInst *SI = dyn_cast<StoreInst>(it)) if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(SI->getValueOperand())) { - if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI, - MinVecRegSize) || - tryToVectorize(BinOp, R)) { + bool SuccessToMatchHorizontalReduction = false; + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI, + VecRegSize)) { + SuccessToMatchHorizontalReduction = true; + break; + } + } + if (SuccessToMatchHorizontalReduction || tryToVectorize(BinOp, R)) { Changed = true; it = BB->begin(); e = BB->end(); @@ -4442,12 +4419,16 @@ // Vectorize starting with the build vector operands ignoring the // BuildVector instructions for the purpose of scheduling and user // extraction. - if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { - Changed = true; - it = BB->begin(); - e = BB->end(); + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (tryToVectorizeList(BuildVectorOpds, R, VecRegSize, false, + BuildVector, false)) { + Changed = true; + it = BB->begin(); + e = BB->end(); + break; + } } - continue; } } @@ -4532,7 +4513,13 @@ // performed in parallel. It's likely that detecting this pattern in a // bottom-up phase will be simpler and less costly than building a // full-blown top-down phase beginning at the consecutive loads. 
- Changed |= tryToVectorizeList(Bundle, R); + for (unsigned VecRegSize = MaxVecRegSize; VecRegSize >= MinVecRegSize; + VecRegSize /= 2) { + if (tryToVectorizeList(Bundle, R, VecRegSize)) { + Changed = true; + break; + } + } } } return Changed; @@ -4555,8 +4542,7 @@ // may cause a significant compile-time increase. for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { unsigned Len = std::min<unsigned>(CE - CI, 16); - Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), - -SLPCostThreshold, R); + Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R); } } return Changed; Index: test/Transforms/SLPVectorizer/AArch64/slp-vectorized-from-max-to-min.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/AArch64/slp-vectorized-from-max-to-min.ll @@ -0,0 +1,37 @@ +;RUN: opt -S -slp-vectorizer -slp-max-reg-size=128 -slp-min-reg-size=64 -slp-threshold=-13 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK: @foo +; CHECK: add nsw <2 x i64> +; CHECK: add nsw <2 x i64> + +define i64 @foo(i64* nocapture readonly %a) #0 { +entry: + %idx1 = getelementptr inbounds i64, i64* %a, i64 1 + %idx2 = getelementptr inbounds i64, i64* %a, i64 2 + %idx3 = getelementptr inbounds i64, i64* %a, i64 3 + %idx4 = getelementptr inbounds i64, i64* %a, i64 4 + %idx5 = getelementptr inbounds i64, i64* %a, i64 5 + %idx6 = getelementptr inbounds i64, i64* %a, i64 6 + %idx7 = getelementptr inbounds i64, i64* %a, i64 7 + %0 = load i64, i64* %a, align 4 + %1 = load i64, i64* %idx1, align 4 + %2 = load i64, i64* %idx2, align 4 + %3 = load i64, i64* %idx3, align 4 + %4 = load i64, i64* %idx4, align 4 + %5 = load i64, i64* %idx5, align 4 + %6 = load i64, i64* %idx6, align 4 + %7 = load i64, i64* %idx7, align 4 + %add = add nsw i64 %1, %0 + %add1 = add nsw i64 %3, %2 + %add2 = add nsw i64 %5, %4 + %add3 = add nsw i64 %7, %6 + %add8 = add nsw i64 
%add1, %add + %add9 = add nsw i64 %add3, %add2 + %add12 = add nsw i64 %add9, %add8 + ret i64 %add12 +} + +