Index: include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- include/llvm/Transforms/Utils/LoopUtils.h +++ include/llvm/Transforms/Utils/LoopUtils.h @@ -175,6 +175,10 @@ static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes); + /// Returns true if Phi is first-order recurrence. + static bool isFirstOrderRecurrence(PHINode *Current, Loop *TheLoop, + DominatorTree *DT); + RecurrenceKind getRecurrenceKind() { return Kind; } MinMaxRecurrenceKind getMinMaxRecurrenceKind() { return MinMaxKind; } Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -514,6 +514,44 @@ return false; } +bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Current, + Loop *TheLoop, + DominatorTree *DT) { + + // Ensure the phi is in the loop and that the loop has a single latch block. + // The loop vectorizer will need a latch block to set up the next iteration + // of the loop. + if (Current->getParent() != TheLoop->getHeader() || !TheLoop->getLoopLatch()) + return false; + + // A first-order recurrence looks like "current = phi(initial, previous)". + // Ensure the phi has only two incoming values. + if (Current->getNumIncomingValues() != 2) + return false; + + // Get the initial and previous values. + auto *Initial = Current->getIncomingValue(0); + auto *Previous = Current->getIncomingValue(1); + + // Ensure the initial value is loop-invariant and the previous value is + // loop-varying. We already know the current value is loop-varying. + if (!TheLoop->isLoopInvariant(Initial)) + std::swap(Initial, Previous); + if (!TheLoop->isLoopInvariant(Initial) || TheLoop->isLoopInvariant(Previous)) + return false; + + // Ensure every user of the current value is loop-varying and dominated by + // the previous value. The dominance requirement implies that the loop + // vectorizer will not need to vectorize the initial value outside the loop. + for (User *U : Current->users()) + if (auto *I = dyn_cast(U)) + if (TheLoop->isLoopInvariant(I) || + !DT->dominates(cast(Previous), I)) + return false; + + return true; +} + /// This function returns the identity element (or neutral element) for /// the operation K. Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurrenceKind K, Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -364,6 +364,22 @@ /// Copy and widen the instructions from the old loop. virtual void vectorizeLoop(); + /// Vectorize first-order recurrences. + /// + /// A first-order recurrence is a non-reduction recurrence relation in which + /// the value of the recurrence in the current loop iteration equals a value + /// defined in the previous iteration. For example, in the loop below: + /// + /// for (int i = 0; i < n; ++i) + /// b[i] = a[i] - a[i - 1]; + /// + /// There is a first-order recurrence on "a". The recurrence is a PHI node of + /// the form: + /// + /// recurrence = phi [a[-1], loop.preheader], [a[i], loop.header] + /// + void vectorizeFirstOrderRecurrence(PHINode *Phi); + /// \brief The Loop exit block may have single value PHI nodes where the /// incoming value is 'Undef'. While vectorizing we only handled real values /// that were defined inside the loop. Here we fix the 'undef case'. @@ -1201,6 +1217,10 @@ /// induction descriptor. typedef MapVector InductionList; + /// RecurrenceSet contains the phi nodes that are recurrences other than + /// inductions and reductions. + typedef SmallPtrSet RecurrenceSet; + /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -1215,6 +1235,9 @@ /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } + /// Return the first-order recurrences found in the loop. + RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; } + /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } @@ -1224,6 +1247,9 @@ /// Returns True if PN is a reduction variable in this loop. bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } + /// Returns True if Phi is a first-order recurrence in this loop. + bool isFirstOrderRecurrence(const PHINode *Phi); + /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB); @@ -1373,6 +1399,8 @@ /// Notice that inductions don't need to start at zero and that induction /// variables can be pointers. InductionList Inductions; + /// Holds the phi nodes that are first-order recurrences. + RecurrenceSet FirstOrderRecurrences; /// Holds the widest induction type encountered. Type *WidestIndTy; @@ -3294,8 +3322,14 @@ for (PHINode *Phi : PHIsToFix) { assert(Phi && "Unable to recover vectorized PHI"); - // We currently only handle reductions. Ensure the PHI node to be fixed is - // a reduction, and get its reduction variable descriptor. + // Handle first-order recurrences that need to be fixed. + if (Legal->isFirstOrderRecurrence(Phi)) { + vectorizeFirstOrderRecurrence(Phi); + continue; + } + + // If the PHI node is not a first-order recurrence, it must be a reduction. + // Get it's reduction variable descriptor. assert(Legal->isReductionVariable(Phi) && "Unable to find the reduction variable"); RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; @@ -3521,6 +3555,109 @@ cse(LoopVectorBody); } +void InnerLoopVectorizer::vectorizeFirstOrderRecurrence(PHINode *Phi) { + + // Ensure the phi is in the original loop header. + assert(Phi->getParent() == OrigLoop->getHeader() && + "Phi isn't in loop header"); + + // Get the single loop latch. We ensured the loop had a single latch when + // detecting the recurrence. + auto *Latch = LI->getLoopFor(LoopVectorBody[0])->getLoopLatch(); + assert(Latch && "Loop doesn't have single latch"); + + // Ensure the phi has two incoming values. + assert(Phi->getNumIncomingValues() == 2 && + "Phi doesn't have 2 incoming values"); + + // Get the initial and previous values of the scalar recurrence. First-order + // recurrences are of the form "current = phi(initial, previous)". + auto *ScalarInit = Phi->getIncomingValue(0); + auto *Previous = Phi->getIncomingValue(1); + if (!OrigLoop->isLoopInvariant(ScalarInit)) + std::swap(ScalarInit, Previous); + + // Ensure the previous value is loop-varying and the initial value is + // loop-invariant. + assert(!OrigLoop->isLoopInvariant(Previous) && + "Previous value is loop-varying"); + assert(OrigLoop->isLoopInvariant(ScalarInit) && + "Initial value isn't loop-invariant"); + + // Create a vector from the initial value. + auto *VectorInit = ScalarInit; + if (VF > 1) { + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + VectorInit = Builder.CreateInsertElement( + UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, + Builder.getInt32(VF - 1), "vector.recur.init"); + } + + // We constructed a temporary phi when vectorizing the loop. This phi will + // eventually be deleted. + auto &PhiParts = getVectorValue(Phi); + Builder.SetInsertPoint(cast(PhiParts[0])); + + // Create a phi for the new recurrence. The current value will either be the + // splatted initial value or loop-varying vector value. + SmallVector CurrentParts; + PHINode *Vec = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); + Vec->addIncoming(VectorInit, LoopVectorPreHeader); + CurrentParts.push_back(Vec); + + // Get the vectorized previous value. We know the previous value is an + // instruction because we ensured it was loop-varying. + auto &PreviousParts = getVectorValue(Previous); + + // Set the insertion point to be after this instruction. We ensured the + // previous value dominated all uses of the phi when detecting the + // recurrence. + Builder.SetInsertPoint( + &*++BasicBlock::iterator(cast(PreviousParts[UF - 1]))); + + // We will construct a vector for the recurrence by combining the values for + // the current and previous iterations. This is the required shuffle mask. + SmallVector ShuffleMask(VF); + ShuffleMask[0] = Builder.getInt32(VF - 1); + for (unsigned I = 1; I < VF; ++I) + ShuffleMask[I] = Builder.getInt32(I + VF - 1); + + // Shuffle the current and previous vector and update the vector parts. + for (unsigned Part = 0; Part < UF; ++Part) { + auto *Shuffle = VF > 1 + ? Builder.CreateShuffleVector( + CurrentParts[Part], PreviousParts[Part], + ConstantVector::get(ShuffleMask)) + : CurrentParts[Part]; + PhiParts[Part]->replaceAllUsesWith(Shuffle); + cast(PhiParts[Part])->eraseFromParent(); + PhiParts[Part] = Shuffle; + CurrentParts.push_back(PreviousParts[Part]); + } + + // Fix the latch value of the new recurrence in the vector loop. + Vec->addIncoming(CurrentParts.back(), Latch); + + // Extract the last vector element in the middle block. This will be the + // initial value for the recurrence when jumping to the scalar loop. + auto *Extract = CurrentParts.back(); + if (VF > 1) { + Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); + Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1), + "vector.recur.extract"); + } + + // Fix the initial value of the original recurrence in the scalar loop. + Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); + auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); + for (auto *BB : predecessors(LoopScalarPreHeader)) { + auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit; + Start->addIncoming(Incoming, BB); + } + Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start); + Phi->setName("scalar.recur"); +} + void InnerLoopVectorizer::fixLCSSAPHIs() { for (BasicBlock::iterator LEI = LoopExitBlock->begin(), LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { @@ -3595,14 +3732,14 @@ Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, unsigned VF, PhiVector *PV) { PHINode* P = cast(PN); - // Handle reduction variables: - if (Legal->isReductionVariable(P)) { - for (unsigned part = 0; part < UF; ++part) { - // This is phase one of vectorizing PHIs. - Type *VecTy = (VF == 1) ? PN->getType() : - VectorType::get(PN->getType(), VF); - Entry[part] = PHINode::Create( - VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); + + // Handle recurrences. We create a temporary phi in the vector type that we + // will fix later. + if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { + for (unsigned Part = 0; Part < UF; ++Part) { + auto *Ty = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); + Entry[Part] = PHINode::Create( + Ty, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); } PV->push_back(P); return; @@ -4296,6 +4433,11 @@ continue; } + if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) { + FirstOrderRecurrences.insert(Phi); + continue; + } + emitAnalysis(VectorizationReport(&*it) << "value that could not be identified as " "reduction is used outside the loop"); @@ -4473,6 +4615,10 @@ return Inductions.count(PN); } +bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { + return FirstOrderRecurrences.count(Phi); +} + bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } @@ -5337,9 +5483,17 @@ case Instruction::Br: { return TTI.getCFInstrCost(I->getOpcode()); } - case Instruction::PHI: - //TODO: IF-converted IFs become selects. + case Instruction::PHI: { + auto *Phi = cast(I); + + // First-order recurrences are replaced by vector shuffles inside the loop. + if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) + return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + VectorTy, VF - 1, VectorTy); + + // TODO: IF-converted IFs become selects. return 0; + } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: Index: test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll @@ -0,0 +1,175 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: @recurrence_1 +; +; void recurrence_1(int *a, int *b, int n) { +; for(int i = 0; i < n; i++) +; b[i] = a[i] + a[i - 1] +; } +; +; CHECK: vector.ph: +; CHECK: %vector.recur.init = insertelement <4 x i32> +; +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ] +; CHECK: shufflevector <4 x i32> %vector.recur +; +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i32> +; +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ] +; +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ] +; +define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) { +entry: + br label %for.preheader + +for.preheader: + %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0 + %pre_load = load i32, i32* %arrayidx.phi.trans.insert + br label %scalar.body + +scalar.body: + %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next + %1 = load i32, i32* %arrayidx32 + %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %add35 = add i32 %1, %0 + store i32 %add35, i32* %arrayidx34 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.exit, label %scalar.body + +for.exit: + ret void +} + +; CHECK-LABEL: @recurrence_2 +; +; int recurrence_2(int *a, int n) { +; int minmax; +; for (int i = 0; i < n; ++i) +; minmax = min(minmax, max(a[i] - a[i-1], 0)); +; return minmax; +; } +; +; CHECK: vector.ph: +; CHECK: %vector.recur.init = insertelement <4 x i32> +; +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ] +; CHECK: shufflevector <4 x i32> %vector.recur +; +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i32> +; +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ] +; +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ] +; +define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) { +entry: + %cmp27 = icmp sgt i32 %n, 0 + br i1 %cmp27, label %scalar.body.preheader, label %for.cond.cleanup + +scalar.body.preheader: + %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1 + %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4 + br label %scalar.body + +for.cond.cleanup.loopexit: + %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ] + br label %for.cond.cleanup + +for.cond.cleanup: + %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %minmax.0.lcssa + +scalar.body: + %0 = phi i32 [ %.pre, %scalar.body.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %scalar.body.preheader ], [ %indvars.iv.next, %scalar.body ] + %minmax.028 = phi i32 [ undef, %scalar.body.preheader ], [ %minmax.0.cond, %scalar.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4 + %sub3 = sub nsw i32 %1, %0 + %cmp4 = icmp sgt i32 %sub3, 0 + %cond = select i1 %cmp4, i32 %sub3, i32 0 + %cmp5 = icmp slt i32 %minmax.028, %cond + %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body +} + +; CHECK-LABEL: @recurrence_3 +; +; void recurrence_3(short *a, double *b, int n, float f, short p) { +; b[0] = (double)a[0] - f * (double)p; +; for (int i = 1; i < n; i++) +; b[i] = (double)a[i] - f * (double)a[i - 1]; +; } +; +; CHECK: vector.ph: +; CHECK: %vector.recur.init = insertelement <2 x i16> +; +; CHECK: vector.body: +; CHECK: %vector.recur = phi <2 x i16> [ %vector.recur.init, %vector.ph ] +; CHECK: shufflevector <2 x i16> %vector.recur +; +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <2 x i16> +; +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ] +; +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ] +; +define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) { +entry: + %0 = load i16, i16* %a, align 2 + %conv = sitofp i16 %0 to double + %conv1 = fpext float %f to double + %conv2 = sitofp i16 %p to double + %mul = fmul fast double %conv2, %conv1 + %sub = fsub fast double %conv, %mul + store double %sub, double* %b, align 8 + %cmp25 = icmp sgt i32 %n, 1 + br i1 %cmp25, label %scalar.body.preheader, label %for.end + +scalar.body.preheader: + br label %scalar.body + +scalar.body: + %1 = phi i16 [ %0, %scalar.body.preheader ], [ %2, %scalar.body ] + %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %scalar.body.preheader ] + %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv + %2 = load i16, i16* %arrayidx5, align 2 + %conv6 = sitofp i16 %2 to double + %conv11 = sitofp i16 %1 to double + %mul12 = fmul fast double %conv11, %conv1 + %sub13 = fsub fast double %conv6, %mul12 + %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv + store double %sub13, double* %arrayidx15, align 8 + %advars.iv.next = add nuw nsw i64 %advars.iv, 1 + %lftr.wideiv = trunc i64 %advars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end.loopexit, label %scalar.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +}