Index: include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- include/llvm/Transforms/Utils/LoopUtils.h +++ include/llvm/Transforms/Utils/LoopUtils.h @@ -175,6 +175,10 @@ static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes); + /// Returns true if Phi is first-order recurrence. + static bool isFirstOrderRecurrence(PHINode *Current, Loop *TheLoop, + DominatorTree *DT); + RecurrenceKind getRecurrenceKind() { return Kind; } MinMaxRecurrenceKind getMinMaxRecurrenceKind() { return MinMaxKind; } Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -514,6 +514,44 @@ return false; } +bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Current, + Loop *TheLoop, + DominatorTree *DT) { + + // Ensure the phi is in the loop and that the loop has a single latch block. + // The loop vectorizer will need a latch block to set up the next iteration + // of the loop. + if (Current->getParent() != TheLoop->getHeader() || !TheLoop->getLoopLatch()) + return false; + + // A first-order recurrence looks like "current = phi(initial, previous)". + // Ensure the phi has only two incoming values. + if (Current->getNumIncomingValues() != 2) + return false; + + // Get the initial and previous values. + auto *Initial = Current->getIncomingValue(0); + auto *Previous = Current->getIncomingValue(1); + + // Ensure the initial value is loop-invariant and the previous value is + // loop-varying. We already know the current value is loop-varying. + if (!TheLoop->isLoopInvariant(Initial)) + std::swap(Initial, Previous); + if (!TheLoop->isLoopInvariant(Initial) || TheLoop->isLoopInvariant(Previous)) + return false; + + // Ensure every user of the current value is loop-varying and dominated by + // the previous value. The dominance requirement implies that the loop + // vectorizer will not need to vectorize the initial value outside the loop. + for (User *U : Current->users()) + if (auto *I = dyn_cast(U)) + if (TheLoop->isLoopInvariant(I) || + !DT->dominates(cast(Previous), I)) + return false; + + return true; +} + /// This function returns the identity element (or neutral element) for /// the operation K. Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurrenceKind K, Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -364,6 +364,9 @@ /// Copy and widen the instructions from the old loop. virtual void vectorizeLoop(); + /// Vectorize first-order recurrences. + void fixFirstOrderRecurrence(PHINode *Phi); + /// \brief The Loop exit block may have single value PHI nodes where the /// incoming value is 'Undef'. While vectorizing we only handled real values /// that were defined inside the loop. Here we fix the 'undef case'. @@ -1201,6 +1204,10 @@ /// induction descriptor. typedef MapVector InductionList; + /// RecurrenceSet contains the phi nodes that are recurrences other than + /// inductions and reductions. + typedef SmallPtrSet RecurrenceSet; + /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -1215,6 +1222,9 @@ /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } + /// Return the first-order recurrences found in the loop. + RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; } + /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } @@ -1224,6 +1234,9 @@ /// Returns True if PN is a reduction variable in this loop. bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } + /// Returns True if Phi is a first-order recurrence in this loop. + bool isFirstOrderRecurrence(const PHINode *Phi); + /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB); @@ -1373,6 +1386,8 @@ /// Notice that inductions don't need to start at zero and that induction /// variables can be pointers. InductionList Inductions; + /// Holds the phi nodes that are first-order recurrences. + RecurrenceSet FirstOrderRecurrences; /// Holds the widest induction type encountered. Type *WidestIndTy; @@ -3302,6 +3317,16 @@ PHINode *RdxPhi = *it; assert(RdxPhi && "Unable to recover vectorized PHI"); + // If RdxPhi is a first-order recurrence, it's not actually a reduction, + // but it still needs to be fixed. Vectorize it and continue. + // + // FIXME: We need to refactor this function since the recurrences in + // RdxPHIsToFix are not necessarily reductions. + if (Legal->isFirstOrderRecurrence(RdxPhi)) { + fixFirstOrderRecurrence(RdxPhi); + continue; + } + // Find the reduction variable descriptor. assert(Legal->isReductionVariable(RdxPhi) && "Unable to find the reduction variable"); @@ -3528,6 +3553,92 @@ cse(LoopVectorBody); } +void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { + + // Get the single loop latch. We ensured the loop had a single latch when + // detecting the recurrence. + auto *Latch = LI->getLoopFor(LoopVectorBody[0])->getLoopLatch(); + + // Get the initial and previous values of the scalar recurrence. First-order + // recurrences are of the form "current = phi(initial, previous)". + auto *Initial = Phi->getIncomingValue(0); + auto *Previous = Phi->getIncomingValue(1); + if (!OrigLoop->isLoopInvariant(Initial)) + std::swap(Initial, Previous); + + // Create a vector from the initial value. + if (VF > 1) { + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + Initial = Builder.CreateVectorSplat(VF, Initial, "vector.recur"); + Initial->setName("vector.recur.init"); + } + + // We constructed a temporary phi when vectorizing the loop. This phi will + // eventually be deleted. + auto &PhiParts = getVectorValue(Phi); + Builder.SetInsertPoint(cast(PhiParts[0])); + + // Create a phi for the new recurrence. The current value will either be the + // splatted initial value or loop-varying vector value. + SmallVector CurrentParts; + PHINode *Vec = Builder.CreatePHI(Initial->getType(), 2, "vector.recur"); + Vec->addIncoming(Initial, LoopVectorPreHeader); + CurrentParts.push_back(Vec); + + // Get the vectorized previous value. We know the previous value is an + // instruction because we ensured it was loop-varying. + auto &PreviousParts = getVectorValue(Previous); + + // Set the insertion point to be after this instruction. We ensured the + // previous value dominated all uses of the phi when detecting the + // recurrence. + Builder.SetInsertPoint( + &*++BasicBlock::iterator(cast(PreviousParts[UF - 1]))); + + // We will construct a vector for the recurrence by combining the values for + // the current and previous iterations. This is the required shuffle mask. + SmallVector ShuffleMask(VF); + ShuffleMask[0] = Builder.getInt32(VF - 1); + for (unsigned I = 1; I < VF; ++I) + ShuffleMask[I] = Builder.getInt32(I + VF - 1); + + // Shuffle the current and previous vector and update the vector parts. + for (unsigned Part = 0; Part < UF; ++Part) { + auto *Shuffle = VF > 1 + ? Builder.CreateShuffleVector( + CurrentParts[Part], PreviousParts[Part], + ConstantVector::get(ShuffleMask)) + : CurrentParts[Part]; + PhiParts[Part]->replaceAllUsesWith(Shuffle); + cast(PhiParts[Part])->eraseFromParent(); + PhiParts[Part] = Shuffle; + CurrentParts.push_back(PreviousParts[Part]); + } + + // Fix the latch value of the new recurrence in the vector loop. + Vec->addIncoming(CurrentParts.back(), Latch); + + // Extract the last vector element in the middle block. This will be the + // initial value for the recurrence when jumping to the scalar loop. + auto *Extract = CurrentParts.back(); + if (VF > 1) { + Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); + Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1), + "vector.recur.extract"); + Initial = Builder.CreateExtractElement(Initial, Builder.getInt32(0)); + } + + // Fix the initial value of the original recurrence in the scalar loop. + Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); + auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); + for (auto *BB : predecessors(LoopScalarPreHeader)) { + auto *Incoming = BB == LoopMiddleBlock ? Extract : Initial; + Start->addIncoming(Incoming, BB); + } + Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start); + Phi->setName("scalar.recur"); +} + void InnerLoopVectorizer::fixLCSSAPHIs() { for (BasicBlock::iterator LEI = LoopExitBlock->begin(), LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { @@ -3602,14 +3713,14 @@ Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, unsigned VF, PhiVector *PV) { PHINode* P = cast(PN); - // Handle reduction variables: - if (Legal->isReductionVariable(P)) { - for (unsigned part = 0; part < UF; ++part) { - // This is phase one of vectorizing PHIs. - Type *VecTy = (VF == 1) ? PN->getType() : - VectorType::get(PN->getType(), VF); - Entry[part] = PHINode::Create( - VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); + + // Handle recurrences. We create a temporary phi in the vector type that we + // will fix later. + if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { + for (unsigned Part = 0; Part < UF; ++Part) { + auto *Ty = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); + Entry[Part] = PHINode::Create( + Ty, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); } PV->push_back(P); return; @@ -4303,6 +4414,11 @@ continue; } + if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) { + FirstOrderRecurrences.insert(Phi); + continue; + } + emitAnalysis(VectorizationReport(&*it) << "value that could not be identified as " "reduction is used outside the loop"); @@ -4480,6 +4596,10 @@ return Inductions.count(PN); } +bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { + return FirstOrderRecurrences.count(Phi); +} + bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } @@ -5344,9 +5464,17 @@ case Instruction::Br: { return TTI.getCFInstrCost(I->getOpcode()); } - case Instruction::PHI: - //TODO: IF-converted IFs become selects. + case Instruction::PHI: { + auto *Phi = cast(I); + + // First-order recurrences are replaced by vector shuffles inside the loop. + if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) + return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + VectorTy, VF - 1, VectorTy); + + // TODO: IF-converted IFs become selects. return 0; + } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: Index: test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll @@ -0,0 +1,178 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: @recurrence_1 +; +; void recurrence_1(int *a, int *b, int n) { +; for(int i = 0; i < n; i++) +; b[i] = a[i] + a[i - 1] +; } +; +; CHECK: vector.ph: +; CHECK: %vector.recur.splatinsert = insertelement <4 x i32> +; CHECK: %vector.recur.init = shufflevector <4 x i32> %vector.recur.splatinsert +; +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ] +; CHECK: shufflevector <4 x i32> %vector.recur +; +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i32> +; +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ] +; +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ] +; +define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) { +entry: + br label %for.preheader + +for.preheader: + %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0 + %pre_load = load i32, i32* %arrayidx.phi.trans.insert + br label %scalar.body + +scalar.body: + %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next + %1 = load i32, i32* %arrayidx32 + %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %add35 = add i32 %1, %0 + store i32 %add35, i32* %arrayidx34 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.exit, label %scalar.body + +for.exit: + ret void +} + +; CHECK-LABEL: @recurrence_2 +; +; int recurrence_2(int *a, int n) { +; int minmax; +; for (int i = 0; i < n; ++i) +; minmax = min(minmax, max(a[i] - a[i-1], 0)); +; return minmax; +; } +; +; CHECK: vector.ph: +; CHECK: %vector.recur.splatinsert = insertelement <4 x i32> +; CHECK: %vector.recur.init = shufflevector <4 x i32> %vector.recur.splatinsert +; +; CHECK: vector.body: +; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ] +; CHECK: shufflevector <4 x i32> %vector.recur +; +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i32> +; +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ] +; +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ] +; +define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) { +entry: + %cmp27 = icmp sgt i32 %n, 0 + br i1 %cmp27, label %scalar.body.preheader, label %for.cond.cleanup + +scalar.body.preheader: + %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1 + %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4 + br label %scalar.body + +for.cond.cleanup.loopexit: + %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ] + br label %for.cond.cleanup + +for.cond.cleanup: + %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %minmax.0.lcssa + +scalar.body: + %0 = phi i32 [ %.pre, %scalar.body.preheader ], [ %1, %scalar.body ] + %indvars.iv = phi i64 [ 0, %scalar.body.preheader ], [ %indvars.iv.next, %scalar.body ] + %minmax.028 = phi i32 [ undef, %scalar.body.preheader ], [ %minmax.0.cond, %scalar.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4 + %sub3 = sub nsw i32 %1, %0 + %cmp4 = icmp sgt i32 %sub3, 0 + %cond = select i1 %cmp4, i32 %sub3, i32 0 + %cmp5 = icmp slt i32 %minmax.028, %cond + %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body +} + +; CHECK-LABEL: @recurrence_3 +; +; void recurrence_3(short *a, double *b, int n, float f, short p) { +; b[0] = (double)a[0] - f * (double)p; +; for (int i = 1; i < n; i++) +; b[i] = (double)a[i] - f * (double)a[i - 1]; +; } +; +; CHECK: vector.ph: +; CHECK: %vector.recur.splatinsert = insertelement <2 x i16> +; CHECK: %vector.recur.init = shufflevector <2 x i16> %vector.recur.splatinsert +; +; CHECK: vector.body: +; CHECK: %vector.recur = phi <2 x i16> [ %vector.recur.init, %vector.ph ] +; CHECK: shufflevector <2 x i16> %vector.recur +; +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <2 x i16> +; +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ] +; +; CHECK: scalar.body: +; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ] +; +define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) { +entry: + %0 = load i16, i16* %a, align 2 + %conv = sitofp i16 %0 to double + %conv1 = fpext float %f to double + %conv2 = sitofp i16 %p to double + %mul = fmul fast double %conv2, %conv1 + %sub = fsub fast double %conv, %mul + store double %sub, double* %b, align 8 + %cmp25 = icmp sgt i32 %n, 1 + br i1 %cmp25, label %scalar.body.preheader, label %for.end + +scalar.body.preheader: + br label %scalar.body + +scalar.body: + %1 = phi i16 [ %0, %scalar.body.preheader ], [ %2, %scalar.body ] + %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %scalar.body.preheader ] + %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv + %2 = load i16, i16* %arrayidx5, align 2 + %conv6 = sitofp i16 %2 to double + %conv11 = sitofp i16 %1 to double + %mul12 = fmul fast double %conv11, %conv1 + %sub13 = fsub fast double %conv6, %mul12 + %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv + store double %sub13, double* %arrayidx15, align 8 + %advars.iv.next = add nuw nsw i64 %advars.iv, 1 + %lftr.wideiv = trunc i64 %advars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end.loopexit, label %scalar.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +}