Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -46,6 +46,7 @@
 #include <algorithm>
 #include <map>
 #include <memory>
+#include <climits>
 
 using namespace llvm;
 
@@ -74,6 +75,10 @@
 
 static const unsigned RecursionMaxDepth = 12;
 
+static bool IsReturn = false;
+
+static bool IsHAdd = false;
+
 /// \returns the parent basic block if all of the instructions in \p VL
 /// are in the same block or null otherwise.
 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
@@ -1620,6 +1625,10 @@
   DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
         VectorizableTree.size() << " is fully vectorizable .\n");
 
+  // Return true if a tree of size 1 feeds into a return and is a horizontal add.
+  if (VectorizableTree.size() == 1 && IsReturn && IsHAdd)
+    return true;
+
   // We only handle trees of height 2.
   if (VectorizableTree.size() != 2)
     return false;
@@ -3315,6 +3324,115 @@
       : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
         ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
 
+  /// Try to find a flat horizontal reduction. The tree structure of such
+  /// addition of type a[0]+a[1]+a[2]+a[3].... will be like
+  /// (+(+(+ a[0], a[1]), a[2]), a[3])....
+  /// Try to vectorize such a tree for integer types only.
+  bool matchFlatReduction(PHINode *Phi, BinaryOperator *B,
+                          const DataLayout *DL) {
+    if (!B)
+      return false;
+
+    if (B->getType()->isVectorTy() || !B->getType()->isIntegerTy())
+      return false;
+
+    ReductionOpcode = B->getOpcode();
+    ReducedValueOpcode = 0;
+    ReduxWidth = MinVecRegSize / DL->getTypeAllocSizeInBits(B->getType());
+    ReductionRoot = B;
+    ReductionPHI = Phi;
+
+    if(ReduxWidth < 4)
+      return false;
+
+    if(ReductionOpcode != Instruction::Add)
+      return false;
+
+    SmallVector<BinaryOperator *, 32> Stack;
+    Value* Op0 = B->getOperand(0);
+    Value* Op1 = B->getOperand(1);
+
+    BinaryOperator* Op0Bin = dyn_cast<BinaryOperator>(Op0);
+    BinaryOperator* Op1Bin = dyn_cast<BinaryOperator>(Op1);
+
+    // None of the operands are binary.
+    if(!Op0Bin && !Op1Bin)
+      return false;
+
+    // Both of the operands are binary, we do not handle it here.
+    if(Op0Bin && Op1Bin)
+      return false;
+
+    if(Op0Bin) {
+      Stack.push_back(Op0Bin);
+      ReducedVals.push_back(Op1);
+    }
+    else {
+      Stack.push_back(Op1Bin);
+      ReducedVals.push_back(Op0);
+    }
+
+    ReductionOps.push_back(B);
+    ReductionOpcode = B->getOpcode();
+
+    // Traversal of the tree.
+    while(!Stack.empty()) {
+      BinaryOperator *Bin = Stack.back();
+
+      if(Bin->getParent() != B->getParent())
+        return false;
+
+      Value* Op0 = Bin->getOperand(0);
+      Value* Op1 = Bin->getOperand(1);
+
+      BinaryOperator* Op0Bin = dyn_cast<BinaryOperator>(Op0);
+      BinaryOperator* Op1Bin = dyn_cast<BinaryOperator>(Op1);
+
+      Stack.pop_back();
+
+      if(!Op0Bin && !Op1Bin) {
+        ReducedVals.push_back(Op1);
+        ReducedVals.push_back(Op0);
+        ReductionOps.push_back(Bin);
+        continue;
+      }
+
+      if(Op0Bin) {
+        if (Op0Bin->getOpcode() != ReductionOpcode)
+          return false;
+        Stack.push_back(Op0Bin);
+        ReducedVals.push_back(Op1);
+        ReductionOps.push_back(Op0Bin);
+      }
+
+      if(Op1Bin) {
+        if(Op1Bin->getOpcode() != ReductionOpcode)
+          return false;
+        Stack.push_back(Op1Bin);
+        ReducedVals.push_back(Op0);
+        ReductionOps.push_back(Op1Bin);
+      }
+
+    }
+
+    SmallVector<Value *, 16> Temp1;
+
+    // Reverse the loads from a[3], a[2], a[1], a[0]
+    // to a[0], a[1], a[2], a[3] for checking incremental
+    // consecutiveness further ahead.
+    while(!ReducedVals.empty())
+      Temp1.push_back(ReducedVals.pop_back_val());
+
+    ReducedVals.clear();
+
+    for(unsigned i = 0, e = Temp1.size(); i < e; ++i)
+      ReducedVals.push_back(Temp1[i]);
+
+    IsHAdd = true;
+    return true;
+  }
+
@@ -3380,11 +3489,31 @@
     Type *ScalarTy = ReducedVals[0]->getType();
     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
 
+    int HAddCost = INT_MAX;
+    // If a horizontal addition pattern is identified, calculate the cost.
+    // Such horizontal additions can be modeled as a combination of
+    // sub-vector shuffles and vector adds plus one single extractelement
+    // from the last resultant vector.
+    // e.g.
+    //      a[0]+a[1]+a[2]+a[3] can be modeled as
+    //      %1 = load <4 x> %0
+    //      %2 = shuffle %1 <2, 3, undef, undef>
+    //      %3 = add <4 x> %1, %2
+    //      %4 = shuffle %3 <1, undef, undef, undef>
+    //      %5 = add <4 x> %3, %4
+    //      %6 = extractelement %5 <0>
+    if(IsHAdd) {
+      unsigned VecElem = VecTy->getVectorNumElements();
+      unsigned NumRedxLevel = Log2_32(VecElem);
+      HAddCost = NumRedxLevel *
+                     (TTI->getArithmeticInstrCost(ReductionOpcode, VecTy) +
+                      TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+                                          VecTy, VecElem/2, VecTy)) +
+                 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, 0);
+    }
+
     int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
     int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
 
     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
+    VecReduxCost = HAddCost < VecReduxCost ? HAddCost : VecReduxCost;
 
     int ScalarReduxCost =
         ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
@@ -3702,8 +3843,12 @@
       if (BinaryOperator *BinOp =
              dyn_cast<BinaryOperator>(RI->getOperand(0))) {
         DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
-        if (tryToVectorizePair(BinOp->getOperand(0),
-                               BinOp->getOperand(1), R)) {
+        HorizontalReduction HorRdx;
+        IsReturn = true;
+        if ((HorRdx.matchFlatReduction(nullptr, BinOp, DL) &&
+             HorRdx.tryToReduce(R, TTI)) ||
+            tryToVectorizePair(BinOp->getOperand(0),
+                               BinOp->getOperand(1), R)) {
           Changed = true;
           it = BB->begin();
           e = BB->end();
Index: test/Transforms/SLPVectorizer/AArch64/flatadd.ll
===================================================================
--- test/Transforms/SLPVectorizer/AArch64/flatadd.ll
+++ test/Transforms/SLPVectorizer/AArch64/flatadd.ll
@@ -0,0 +1,59 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; return
+;   a[0]+a[1]+a[2]+a[3]
+
+; CHECK-LABEL: @flatadd1
+; CHECK: load <4 x i32>*
+; CHECK: shufflevector <4 x i32>
+; CHECK: add <4 x i32>
+; CHECK: extractelement <4 x i32>
+define i32 @flatadd1(i32* nocapture readonly %a) {
+entry:
+  %0 = load i32* %a, align 4
+  %arrayidx1 = getelementptr inbounds i32* %a, i32 1
+  %1 = load i32* %arrayidx1, align 4
+  %add = add nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds i32* %a, i32 2
+  %2 = load i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %add, %2
+  %arrayidx4 = getelementptr inbounds i32* %a, i32 3
+  %3 = load i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %add3, %3
+  ret i32 %add5
+}
+
+; return a[0]+a[1]+a[2]+a[3]+a[4]+a[5]+a[6]+a[7]
+
+; CHECK-LABEL: @flatadd2
+; CHECK: load <4 x i32>*
+; CHECK: shufflevector <4 x i32>
+; CHECK: add <4 x i32>
+; CHECK: extractelement <4 x i32>
+define i32 @flatadd2(i32* nocapture readonly %a) {
+entry:
+  %0 = load i32* %a, align 4
+  %arrayidx1 = getelementptr inbounds i32* %a, i64 1
+  %1 = load i32* %arrayidx1, align 4
+  %add = add nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 2
+  %2 = load i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %add, %2
+  %arrayidx4 = getelementptr inbounds i32* %a, i64 3
+  %3 = load i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %add3, %3
+  %arrayidx6 = getelementptr inbounds i32* %a, i64 4
+  %4 = load i32* %arrayidx6, align 4
+  %add7 = add nsw i32 %add5, %4
+  %arrayidx8 = getelementptr inbounds i32* %a, i64 5
+  %5 = load i32* %arrayidx8, align 4
+  %add9 = add nsw i32 %add7, %5
+  %arrayidx10 = getelementptr inbounds i32* %a, i64 6
+  %6 = load i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %add9, %6
+  %arrayidx12 = getelementptr inbounds i32* %a, i64 7
+  %7 = load i32* %arrayidx12, align 4
+  %add13 = add nsw i32 %add11, %7
+  ret i32 %add13
+}