Index: include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- include/llvm/Transforms/Utils/LoopUtils.h +++ include/llvm/Transforms/Utils/LoopUtils.h @@ -255,10 +255,10 @@ IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem). }; -public: /// Default constructor - creates an invalid induction. InductionDescriptor() - : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {} + : StartValue(nullptr), LoopExitInstr(nullptr), IK(IK_NoInduction), + StepValue(nullptr) {} /// Get the consecutive direction. Returns: /// 0 - unknown or non-consecutive. @@ -275,6 +275,7 @@ Value *transform(IRBuilder<> &B, Value *Index) const; Value *getStartValue() const { return StartValue; } + Instruction *getLoopExitInstr() const { return LoopExitInstr; } InductionKind getKind() const { return IK; } ConstantInt *getStepValue() const { return StepValue; } @@ -283,10 +284,13 @@ private: /// Private constructor - used by \c isInductionPHI. - InductionDescriptor(Value *Start, InductionKind K, ConstantInt *Step); + InductionDescriptor(Value *Start, Instruction *Exit, InductionKind K, + ConstantInt *Step); /// Start value. TrackingVH StartValue; + // The instruction which value is used outside of the loop. + Instruction *LoopExitInstr; /// Induction kind. InductionKind IK; /// Step value. Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -596,9 +596,9 @@ return Select; } -InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K, - ConstantInt *Step) - : StartValue(Start), IK(K), StepValue(Step) { +InductionDescriptor::InductionDescriptor(Value *Start, Instruction *Exit, + InductionKind K, ConstantInt *Step) + : StartValue(Start), LoopExitInstr(Exit), IK(K), StepValue(Step) { assert(IK != IK_NoInduction && "Not an induction"); assert(StartValue && "StartValue is null"); assert(StepValue && !StepValue->isZero() && "StepValue is zero"); @@ -661,6 +661,10 @@ "PHI is an AddRec for a different loop?!"); Value *StartValue = Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader()); + + auto *ExitInstr = dyn_cast( + Phi->getIncomingValueForBlock(AR->getLoop()->getLoopLatch())); + const SCEV *Step = AR->getStepRecurrence(*SE); // Calculate the pointer stride and check if it is consecutive. const SCEVConstant *C = dyn_cast(Step); @@ -669,7 +673,7 @@ ConstantInt *CV = C->getValue(); if (PhiTy->isIntegerTy()) { - D = InductionDescriptor(StartValue, IK_IntInduction, CV); + D = InductionDescriptor(StartValue, ExitInstr, IK_IntInduction, CV); return true; } @@ -690,7 +694,7 @@ return false; auto *StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); - D = InductionDescriptor(StartValue, IK_PtrInduction, StepValue); + D = InductionDescriptor(StartValue, ExitInstr, IK_PtrInduction, StepValue); return true; } Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2759,6 +2759,37 @@ AddedSafetyChecks = true; } +struct PostIncInfo { + PHINode *Induction; + // If a loop is in LCSSA form there can only be one outside user. It's always + // a PHI node in the loop exit block. + PHINode *OutsideUser; +}; + +// Check if given ExitVal is an induction post-inc expression. +static Optional GetPostIncInfo(Instruction *ExitVal, + PHINode *IndVar, Loop *OrigLoop, + LoopVectorizationLegality *Legal) { + if (!ExitVal) + return None; + + // We need to find an outside user for the ExitVal. Loop is in LCSSA form, so + // there is only one outside user that is a phi node. + PHINode *OutsideUser = nullptr; + for (auto *U : ExitVal->users()) { + if (!OrigLoop->contains(cast(U))) { + assert(!OutsideUser && "More than one outside user - not LCSSA form"); + + OutsideUser = cast(U); + assert(OutsideUser->getParent() == OrigLoop->getExitBlock() && + "Not LCSSA form"); + } + } + if (!OutsideUser) + return None; + + return PostIncInfo{ IndVar, OutsideUser }; +} void InnerLoopVectorizer::createEmptyLoop() { /* @@ -2793,12 +2824,27 @@ ... */ + assert(OrigLoop->isLCSSAForm(*DT)); BasicBlock *OldBasicBlock = OrigLoop->getHeader(); BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(VectorPH && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); + // If a loop exit values is a induction post-inc expression, map it to it's + // PostIncInfo. + DenseMap PostIncExpressions; + for (auto &IndInfo : *Legal->getInductionVars()) { + // All induction variables are phi nodes. + auto *InductionVar = cast(IndInfo.first); + + auto *ExitVal = IndInfo.second.getLoopExitInstr(); + Optional OptPostIncInfo = + GetPostIncInfo(ExitVal, InductionVar, OrigLoop, Legal); + if (OptPostIncInfo.hasValue()) + PostIncExpressions[ExitVal] = OptPostIncInfo.getValue(); + } + // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we @@ -2875,6 +2921,10 @@ // If we come from a bypass edge then we need to start from the original // start value. + // Map end values of induction variables to their induction variables; + // used later as resume values of post-inc expressions. + DenseMap IndVarEndVals; + // This variable saves the new starting index for the scalar loop. It is used // to test if there are any tail iterations left once the vector loop has // completed. @@ -2901,6 +2951,9 @@ EndValue->setName("ind.end"); } + // Associate instruction with its end value - after the vector loop. + IndVarEndVals[OrigPhi] = EndValue; + // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. BCResumeVal->addIncoming(EndValue, MiddleBlock); @@ -2927,6 +2980,15 @@ // Get ready to start creating new instructions into the vectorized body. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + // Now we have to repair broken post-inc expressions. We need to add new + // incoming arc to the PHI nodes that are their outside users. The incoming + // arcs are from MiddleBlock and have the value of the associated induction + // variables at the end of newly created vector loop. + for (auto &PIE : PostIncExpressions) { + auto *PHI = PIE.second.OutsideUser; + PHI->addIncoming(IndVarEndVals[PIE.second.Induction], MiddleBlock); + } + // Save the state. LoopVectorPreHeader = Lp->getLoopPreheader(); LoopScalarPreHeader = ScalarPH; @@ -4017,7 +4079,7 @@ /// \brief Check that the instruction has outside loop users and is not an /// identified reduction variable. static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, - SmallPtrSetImpl &Reductions) { + const SmallPtrSetImpl &Reductions) { // Reduction instructions are allowed to have exit users. All other // instructions must not have external users. if (!Reductions.count(Inst)) @@ -4088,6 +4150,10 @@ InductionDescriptor ID; if (InductionDescriptor::isInductionPHI(Phi, SE, ID)) { Inductions[Phi] = ID; + + if (ID.getLoopExitInstr() && TheLoop->isLCSSAForm(*DT)) + AllowedExit.insert(ID.getLoopExitInstr()); + // Get the widest type. if (!WidestIndTy) WidestIndTy = convertPointerToIntegerType(DL, PhiTy); @@ -4189,11 +4255,12 @@ if (LoadInst *LI = dyn_cast(it)) collectStridedAccess(LI); - // Reduction instructions are allowed to have exit users. - // All other instructions must not have external users. + // Reduction instructions and post-inc expressions are allowed to have + // exit users. All other instructions must not have external users. if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << - "value cannot be used outside the loop"); + // Check if an instruction could be a post-inc expression. + emitAnalysis(VectorizationReport(it) + << "value cannot be used outside the loop"); return false; } Index: test/Transforms/LoopVectorize/post-incs.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/post-incs.ll @@ -0,0 +1,145 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Ensure that simple LCSSA loop with one induction post-inc expression gets +; vectorized. +; CHECK-LABEL: @t0 +; CHECK: %[[END:.*]] = getelementptr i8, i8* %dest, +; CHECK: vector.body +; CHECK: for.end.loopexit: +; CHECK: %incdec.ptr.lcssa = phi i8* [ %incdec.ptr, %for.body ], [ %[[END]], %middle.block ] +; CHECK: %ptr.lcssa = phi i8* [ %dest, %entry ], [ %incdec.ptr.lcssa, %for.end.loopexit ] +define void @t0(i32 %y, i32 %num, i8* noalias %dest) { +entry: + %cond = icmp sgt i32 %num, 0; + br i1 %cond, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %vv = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %ptr = phi i8* [ %incdec.ptr, %for.body ], [ %dest, %for.body.preheader ] + %incdec.ptr = getelementptr inbounds i8, i8* %ptr, i64 1 + store i8 0, i8* %ptr, align 1 + %inc = add nuw nsw i32 %vv, 1 + %cmp = icmp slt i32 %inc, %num + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: + %incdec.ptr.lcssa = phi i8* [ %incdec.ptr, %for.body ] + br label %for.end + +for.end: + %ptr.lcssa = phi i8* [ %dest, %entry ], [ %incdec.ptr.lcssa, %for.end.loopexit ] + ret void +} + +; Ensure that simple loop with one induction post-inc expression gets vectorized. +; CHECK-LABEL: @t1 +; CHECK: %[[END:.*]] = getelementptr i8, i8* %dest, +; CHECK: vector.body +; CHECK: for.end.loopexit: +; CHECK: %[[LCSSA:.*]] = phi i8* [ %incdec.ptr, %for.body ], [ %[[END]], %middle.block ] +; CHECK: phi i8* [ %dest, %entry ], [ %[[LCSSA]], %for.end.loopexit ] +define void @t1(i32 %y, i8* noalias %dest, i32 %num) { +entry: + %cond = icmp sgt i32 %num, 0; + br i1 %cond, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %vv = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %ptr = phi i8* [ %incdec.ptr, %for.body ], [ %dest, %for.body.preheader ] + %incdec.ptr = getelementptr inbounds i8, i8* %ptr, i64 1 + store i8 0, i8* %ptr, align 1 + %inc = add nuw nsw i32 %vv, 1 + %cmp = icmp slt i32 %inc, %num + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: + br label %for.end + +for.end: + %ptr.lcssa = phi i8* [ %dest, %entry ], [ %incdec.ptr, %for.end.loopexit ] + ret void +} + +; Ensure that a loop with 2 post-inc expressions gets vectorized +; and that resume values are correct. +; CHECK-LABEL: @t2 +; CHECK: %[[SEND:.*]] = getelementptr i8, i8* %src, +; CHECK: %[[DEND:.*]] = getelementptr i8, i8* %dest, +; CHECK: vector.body: +; CHECK: for.end.loopexit: +; CHECK: %[[LCSSA1:.*]] = phi i8* [ %incdec.ptr1, %for.body ], [ %[[DEND]], %middle.block ] +; CHECK-NEXT: %[[LCSSA:.*]] = phi i8* [ %incdec.ptr, %for.body ], [ %[[SEND]], %middle.block ] +; CHECK: %ptr.lcssa = phi i8* [ %src, %entry ], [ %[[LCSSA]], %for.end.loopexit ] +; CHECK-NEXT: %ptr1.lcssa = phi i8* [ %dest, %entry ], [ %[[LCSSA1]], %for.end.loopexit ] +define void @t2(i32 %y, i8* noalias %src, i8* noalias %dest, i32 %num) { +entry: + %cond = icmp sgt i32 %num, 0; + br i1 %cond, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %vv = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %p = phi i8* [ %incdec.ptr, %for.body ], [ %src, %for.body.preheader ] + %ptr = phi i8* [ %incdec.ptr1, %for.body ], [ %dest, %for.body.preheader ] + %incdec.ptr = getelementptr inbounds i8, i8* %p, i64 1 + %0 = load i8, i8* %p, align 1 + %incdec.ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1 + store i8 %0, i8* %ptr, align 1 + %inc = add nuw nsw i32 %vv, 1 + %cmp = icmp slt i32 %inc, %num + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: + br label %for.end + +for.end: + %ptr.lcssa = phi i8* [ %src, %entry ], [ %incdec.ptr, %for.end.loopexit ] + %ptr1.lcssa = phi i8* [ %dest, %entry ], [ %incdec.ptr1, %for.end.loopexit ] + ret void +} + +; Ensure that a simple loop with integer post-inc expression gets vectorized, +; even though the exit value has different value when coming from the entry +; block than the induction start value. +; CHECK-LABEL: @t3 +; CHECK: %[[END:.*]] = add i32 0, +; CHECK: vector.body: +; CHECK: for.body: +; CHECK: %[[IND_VV:.*]] = phi i32 [ %[[INC:.*]], %for.body ], [ %[[RES:.*]], %scalar.ph ] +; CHECK: for.end.loopexit: +; CHECK: %[[LCSSA:.*]] = phi i32 [ %[[INC]], %for.body ], [ %[[END]], %middle.block ] +; CHECK: phi i32 [ %num, %entry ], [ %[[LCSSA]], %for.end.loopexit ] +define void @t3(i32 %y, i8* noalias %dest, i32 %num) { +entry: + %cond = icmp sgt i32 %num, 0; + br i1 %cond, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %vv = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %ptr = phi i8* [ %incdec.ptr1, %for.body ], [ %dest, %for.body.preheader ] + %incdec.ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1 + store i8 0, i8* %ptr, align 1 + %inc = add nuw nsw i32 %vv, 1 + %cmp = icmp slt i32 %inc, %num + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: + br label %for.end + +for.end: + %n.lcssa = phi i32 [ %num, %entry ], [ %inc, %for.end.loopexit ] + ret void +} \ No newline at end of file