Index: include/llvm/Transforms/Utils/LoopUtils.h
===================================================================
--- include/llvm/Transforms/Utils/LoopUtils.h
+++ include/llvm/Transforms/Utils/LoopUtils.h
@@ -175,6 +175,11 @@
   static bool isReductionPHI(PHINode *Phi, Loop *TheLoop,
                              RecurrenceDescriptor &RedDes);
 
+  /// Returns true if Phi is a pre-load recurrence in TheLoop. A pre-load
+  /// recurrence is a phi of loads for the current and previous loop
+  /// iterations.
+  static bool isPreLoadPHI(PHINode *Phi, Loop *TheLoop, ScalarEvolution *SE);
+
   RecurrenceKind getRecurrenceKind() { return Kind; }
 
   MinMaxRecurrenceKind getMinMaxRecurrenceKind() { return MinMaxKind; }
Index: lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- lib/Transforms/Utils/LoopUtils.cpp
+++ lib/Transforms/Utils/LoopUtils.cpp
@@ -514,6 +514,69 @@
   return false;
 }
 
+bool RecurrenceDescriptor::isPreLoadPHI(PHINode *Phi, Loop *TheLoop,
+                                        ScalarEvolution *SE) {
+
+  // Ensure the Phi has two incoming values and is only used once.
+  if (Phi->getNumIncomingValues() != 2 || !Phi->hasOneUse())
+    return false;
+
+  // Ensure the single use of the Phi is in the loop header.
+  auto *User = dyn_cast<Instruction>(*Phi->user_begin());
+  if (!User || User->getParent() != TheLoop->getHeader())
+    return false;
+
+  // Ensure the Phi is an integer or floating point type.
+  Type *Ty = Phi->getType();
+  if (!Ty->isIntegerTy() && !Ty->isFloatingPointTy())
+    return false;
+
+  // Ensure the two incoming blocks are the loop header and preheader.
+  if (Phi->getBasicBlockIndex(TheLoop->getHeader()) < 0 ||
+      !TheLoop->getLoopPreheader() ||
+      Phi->getBasicBlockIndex(TheLoop->getLoopPreheader()) < 0)
+    return false;
+
+  // Ensure that the two incoming values are loads.
+  auto *Load =
+      dyn_cast<LoadInst>(Phi->getIncomingValueForBlock(TheLoop->getHeader()));
+  auto *PreLoad = dyn_cast<LoadInst>(
+      Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()));
+  if (!PreLoad || !Load)
+    return false;
+
+  // Ensure Phi is in the loop header and PreLoad is in the loop preheader.
+  // This also ensures that Load is in the loop header.
+  if (Phi->getParent() != TheLoop->getHeader() ||
+      PreLoad->getParent() != TheLoop->getLoopPreheader())
+    return false;
+
+  // We will now ensure PreLoad and Load (on the first iteration of the loop)
+  // are consecutive accesses. First, get the add recurrence for the pointer
+  // operand of the loop-varying load.
+  auto *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Load->getPointerOperand()));
+  if (!AR)
+    return false;
+
+  // Next, calculate the stride of the pointer operand.
+  auto *Step = AR->getStepRecurrence(*SE);
+  if (!isa<SCEVConstant>(Step))
+    return false;
+
+  // Next, evaluate the add recurrence on the first iteration of the loop.
+  auto *IterationZero =
+      AR->evaluateAtIteration(SE->getConstant(APInt(64, 0)), *SE);
+
+  // Finally, ensure that the pointer operand of the loop-invariant load plus
+  // the loop step size equals the pointer operand of the loop-varying load on
+  // the first iteration of the loop.
+  if (SE->getAddExpr(SE->getSCEV(PreLoad->getPointerOperand()), Step) !=
+      IterationZero)
+    return false;
+
+  return true;
+}
+
 /// This function returns the identity element (or neutral element) for
 /// the operation K.
 Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurrenceKind K,
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -364,6 +364,10 @@
   /// Copy and widen the instructions from the old loop.
   virtual void vectorizeLoop();
 
+  /// Turn the scalar loop-invariant load of a pre-load recurrence into a
+  /// vectorized loop-varying load.
+  void fixPreLoadPHI(PHINode *Phi);
+
   /// \brief The Loop exit block may have single value PHI nodes where the
   /// incoming value is 'Undef'. While vectorizing we only handled real values
   /// that were defined inside the loop. Here we fix the 'undef case'.
@@ -1201,6 +1205,11 @@
   /// induction descriptor.
   typedef MapVector<PHINode *, InductionDescriptor> InductionList;
 
+  /// PreLoadPHISet contains the phi nodes that are pre-load recurrences. A
+  /// pre-load recurrence is a phi of loads for the current and previous loop
+  /// iterations.
+  typedef SmallPtrSet<const PHINode *, 8> PreLoadPHISet;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -1215,6 +1224,9 @@
   /// Returns the induction variables found in the loop.
   InductionList *getInductionVars() { return &Inductions; }
 
+  /// Return the pre-load recurrences found in the loop.
+  PreLoadPHISet *getPreLoadPHIs() { return &PreLoadPHIs; }
+
   /// Returns the widest induction type.
   Type *getWidestInductionType() { return WidestIndTy; }
 
@@ -1224,6 +1236,9 @@
   /// Returns True if PN is a reduction variable in this loop.
   bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
 
+  /// Returns True if Phi is a pre-load recurrence in this loop.
+  bool isPreLoadPHI(const PHINode *Phi);
+
   /// Return true if the block BB needs to be predicated in order for the loop
   /// to be vectorized.
   bool blockNeedsPredication(BasicBlock *BB);
@@ -1312,6 +1327,14 @@
   /// transformation.
   bool canVectorizeWithIfConvert();
 
+  /// Return true if we can sink the loop-invariant load of the pre-load
+  /// recurrence Phi into the body of loop L.
+  ///
+  /// TODO: We can make this check much more precise by utilizing a memory
+  ///       dependence analysis and by considering more complicated
+  ///       control-flow.
+  bool canSinkPreLoadPHI(PHINode *Phi, Loop *L);
+
   /// Collect the variables that need to stay uniform after vectorization.
   void collectLoopUniforms();
@@ -1373,6 +1396,8 @@
   /// Notice that inductions don't need to start at zero and that induction
   /// variables can be pointers.
   InductionList Inductions;
+  /// Holds the phi nodes that are pre-load recurrences.
+  PreLoadPHISet PreLoadPHIs;
   /// Holds the widest induction type encountered.
   Type *WidestIndTy;
@@ -3302,6 +3327,16 @@
     PHINode *RdxPhi = *it;
     assert(RdxPhi && "Unable to recover vectorized PHI");
 
+    // If RdxPhi is a pre-load recurrence, it's not actually a reduction, but
+    // it still needs to be fixed. Vectorize it and continue.
+    //
+    // FIXME: We need to refactor this function since the recurrences in
+    //        RdxPHIsToFix are not necessarily reductions.
+    if (Legal->isPreLoadPHI(RdxPhi)) {
+      fixPreLoadPHI(RdxPhi);
+      continue;
+    }
+
     // Find the reduction variable descriptor.
     assert(Legal->isReductionVariable(RdxPhi) &&
            "Unable to find the reduction variable");
@@ -3528,6 +3563,108 @@
   cse(LoopVectorBody);
 }
 
+void InnerLoopVectorizer::fixPreLoadPHI(PHINode *Phi) {
+
+  // The original loop header and its corresponding loop-varying load.
+  auto *Header = OrigLoop->getHeader();
+  auto *Load = cast<LoadInst>(Phi->getIncomingValueForBlock(Header));
+
+  // The data layout for this module.
+  auto &DL = Header->getModule()->getDataLayout();
+
+  // The original loop preheader and its corresponding loop-invariant load.
+  // This load is used on the first iteration of the loop.
+  auto *PreHeader = OrigLoop->getLoopPreheader();
+  auto *PreLoad = Phi->getIncomingValueForBlock(PreHeader);
+
+  // Get the negated step size of the original loop. Note that these casts were
+  // checked when we originally detected the PreLoadPHI.
+  auto *AR = cast<SCEVAddRecExpr>(PSE.getSCEV(Load->getPointerOperand()));
+  auto *SR = cast<SCEVConstant>(AR->getStepRecurrence(*PSE.getSE()));
+  int LoadSize = DL.getTypeStoreSize(Load->getType());
+  int StepSize = -1 * SR->getValue()->getSExtValue() / LoadSize;
+
+  // Get the VectorParts we will need. We ensured the Phi had only one user
+  // during legalization.
+  auto &LoadParts = getVectorValue(Load);
+  auto &PhiParts = getVectorValue(Phi);
+  auto &UserParts = getVectorValue(*Phi->user_begin());
+
+  // We are going to sink PreLoad into the vectorized loop. We will do this by
+  // duplicating the loop-varying load and subtracting one iteration from its
+  // pointer operand. We will map Phi to this new load in the ValueMap.
+  // However, because the loop-varying load has already been vectorized, we
+  // need to first save its VectorParts before we overwrite them.
+  VectorParts TempParts(UF);
+  for (unsigned I = 0; I < UF; ++I)
+    TempParts[I] = LoadParts[I];
+
+  // Set the insertion point before the vectorized version of the Phi's single
+  // user, and create another copy of the loop-varying load.
+  Builder.SetInsertPoint(cast<Instruction>(UserParts[0]));
+  vectorizeMemoryInstruction(Load);
+
+  for (unsigned Part = 0; Part < UF; ++Part) {
+
+    // If Phi is a PreLoadPHI, Load should not have been scalarized. Thus, its
+    // corresponding vector parts should either be loads (for non-interleaved
+    // accesses) or shuffles (for interleaved accesses). The code below finds
+    // the vector load or asserts if one is not found.
+    auto *V = LoadParts[Part];
+    if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(V))
+      V = Shuffle->getOperand(0);
+    assert(isa<LoadInst>(V) && "Vector part is neither a load nor shuffle");
+    auto *LoadPart = cast<LoadInst>(V);
+    Builder.SetInsertPoint(LoadPart);
+
+    // Get the pointer operand of LoadPart and cast it to a scalar type.
+    auto *Ptr = LoadPart->getPointerOperand();
+    if (auto *VecTy = dyn_cast<VectorType>(LoadPart->getType()))
+      Ptr = Builder.CreateBitOrPointerCast(
+          Ptr, PointerType::getUnqual(VecTy->getElementType()));
+
+    // Subtract one iteration from the pointer operand.
+    auto *GEP = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(StepSize));
+
+    // Cast the pointer operand back to the vector type.
+ if (Ptr != LoadPart->getPointerOperand()) + GEP = Builder.CreateBitOrPointerCast( + GEP, LoadPart->getPointerOperand()->getType()); + + // Update the pointer operand of LoadPart to refer to the new location. + LoadPart->setOperand(0, GEP); + + // Replace the temporary vector values we created for Phi with the new + // loads. We also restore the original LoadParts since it was just + // overwritten with the mapping created for the Phi. + PhiParts[Part]->replaceAllUsesWith(LoadParts[Part]); + PhiParts[Part] = LoadParts[Part]; + LoadParts[Part] = TempParts[Part]; + } + + // If the vector loop jumps to the scalar version to complete the required + // iterations, we need to first extract the last vector value. We have to + // recreate the PreLoadPHI for the scalar loop, ensuring that the first + // iteration is correct. + Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); + auto *Extract = + Builder.CreateExtractElement(LoadParts[UF - 1], Builder.getInt32(VF - 1)); + + // Create a new PHI for the starting value of the scalar PreLoadPHI. This + // will either be the original PreLoad or the value we extracted from the + // vector when exiting the vector loop. + Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); + auto *PreLoadPHI = Builder.CreatePHI(PreLoad->getType(), 2); + for (auto *BB : predecessors(LoopScalarPreHeader)) { + auto *Incoming = BB == LoopMiddleBlock ? Extract : PreLoad; + PreLoadPHI->addIncoming(Incoming, BB); + } + + // Finally, update the incoming value of the scalar PreLoadPHI. + Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), + PreLoadPHI); +} + void InnerLoopVectorizer::fixLCSSAPHIs() { for (BasicBlock::iterator LEI = LoopExitBlock->begin(), LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { @@ -3615,6 +3752,21 @@ return; } + // Handle pre-load recurrences. We create a temporary load within the loop + // for the loop-invariant load of the pre-load recurrence. 
We will fix this
+  // load later after all induction variables have been generated.
+  if (Legal->isPreLoadPHI(P)) {
+    auto *Ty = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+    auto *LI =
+        cast<LoadInst>(P->getIncomingValueForBlock(OrigLoop->getHeader()));
+    for (unsigned part = 0; part < UF; ++part)
+      Entry[part] = new LoadInst(
+          UndefValue::get(PointerType::get(Ty, LI->getPointerAddressSpace())),
+          "vec.pre", &*LoopVectorBody.back()->getFirstInsertionPt());
+    PV->push_back(P);
+    return;
+  }
+
   setDebugLocFromInst(Builder, P);
   // Check for PHI nodes that are lowered to vector selects.
   if (P->getParent() != OrigLoop->getHeader()) {
@@ -4303,6 +4455,14 @@
       continue;
     }
 
+    // If Phi is a pre-load recurrence, we also have to ensure that we can
+    // sink its loop-invariant load into the body of the loop.
+    if (RecurrenceDescriptor::isPreLoadPHI(Phi, TheLoop, PSE.getSE()))
+      if (canSinkPreLoadPHI(Phi, TheLoop)) {
+        PreLoadPHIs.insert(Phi);
+        continue;
+      }
+
     emitAnalysis(VectorizationReport(&*it)
                  << "value that could not be identified as "
                     "reduction is used outside the loop");
@@ -4393,6 +4553,36 @@
   return true;
 }
 
+bool LoopVectorizationLegality::canSinkPreLoadPHI(PHINode *Phi, Loop *L) {
+
+  // The loop header and preheader. We ensured the loop had a preheader when
+  // detecting the PreLoadPHI.
+  auto *Header = L->getHeader();
+  auto *PreHeader = L->getLoopPreheader();
+
+  // The loop-invariant load and the Phi's single user. We checked these casts
+  // when detecting the PreLoadPHI.
+  auto *PreLoad = cast<LoadInst>(Phi->getIncomingValueForBlock(PreHeader));
+  auto *User = dyn_cast<Instruction>(*Phi->user_begin());
+
+  // Ensure there is no store between the loop-invariant load we are going to
+  // sink and the end of the loop preheader. We ensured the loop-invariant load
+  // is in the preheader when detecting the PreLoadPHI.
+  using Iter = BasicBlock::iterator;
+  for (auto I = Iter(PreLoad), E = PreHeader->end(); I != E; ++I)
+    if (isa<StoreInst>(&*I))
+      return false;
+
+  // Ensure there is no store between the beginning of the loop header and the
+  // location to which we're going to sink the load.
+  for (auto I = Header->begin(), E = Iter(User); I != E; ++I)
+    if (isa<StoreInst>(&*I))
+      return false;
+
+  // If we didn't see any stores, it's safe to sink the loop-invariant load.
+  return true;
+}
+
 void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
   Value *Ptr = nullptr;
   if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
@@ -4480,6 +4670,10 @@
   return Inductions.count(PN);
 }
 
+bool LoopVectorizationLegality::isPreLoadPHI(const PHINode *Phi) {
+  return PreLoadPHIs.count(Phi);
+}
+
 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 }
@@ -5327,6 +5521,12 @@
   if (Legal->isUniformAfterVectorization(I))
     VF = 1;
 
+  // If the instruction is a pre-load recurrence, it will be replaced by a load
+  // within the loop. Update the instruction to account for the load.
+  if (auto *Phi = dyn_cast<PHINode>(I))
+    if (VF > 1 && Legal->isPreLoadPHI(Phi))
+      I = cast<LoadInst>(Phi->getIncomingValueForBlock(TheLoop->getHeader()));
+
   Type *RetTy = I->getType();
   if (VF > 1 && MinBWs.count(I))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
Index: test/Transforms/LoopVectorize/AArch64/pre-load-recurrence.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AArch64/pre-load-recurrence.ll
@@ -0,0 +1,147 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: @pre_load_1
+;
+; void pre_load_1(int *a, int *b, int n) {
+;   for(int i = 0; i < n; i++)
+;     b[i] = a[i] + a[i - 1]
+; }
+;
+; CHECK: vector.body:
+; CHECK:   [[L:%[a-zA-Z0-9.]+]] = load <4 x i32>
+;
+; CHECK: middle.block:
+; CHECK:   extractelement <4 x i32> [[L]], i32 3
+;
+define void @pre_load_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) {
+entry:
+  br label %for.preheader
+
+for.preheader:
+  %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0
+  %pre_load = load i32, i32* %arrayidx.phi.trans.insert
+  br label %for.body
+
+for.body:
+  %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
+  %1 = load i32, i32* %arrayidx32
+  %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %add35 = add i32 %1, %0
+  store i32 %add35, i32* %arrayidx34
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
+; CHECK-LABEL: @pre_load_2
+;
+; int pre_load_2(int *a, int n) {
+;   int minmax;
+;   for (int i = 0; i < n; ++i)
+;
minmax = min(minmax, max(a[i] - a[i-1], 0));
+;   return minmax;
+; }
+;
+; CHECK: vector.body:
+; CHECK:   [[L:%[a-zA-Z0-9.]+]] = load <4 x i32>
+;
+; CHECK: middle.block:
+; CHECK:   extractelement <4 x i32> [[L]], i32 3
+;
+define i32 @pre_load_2(i32* nocapture readonly %a, i32 %n) {
+entry:
+  %cmp27 = icmp sgt i32 %n, 0
+  br i1 %cmp27, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1
+  %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %minmax.0.lcssa
+
+for.body:
+  %0 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %minmax.028 = phi i32 [ undef, %for.body.preheader ], [ %minmax.0.cond, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4
+  %sub3 = sub nsw i32 %1, %0
+  %cmp4 = icmp sgt i32 %sub3, 0
+  %cond = select i1 %cmp4, i32 %sub3, i32 0
+  %cmp5 = icmp slt i32 %minmax.028, %cond
+  %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; CHECK-LABEL: @pre_load_3
+;
+; void pre_load_3(short *a, double *b, int n, float f, short p) {
+;   b[0] = (double)a[0] - f * (double)p;
+;   for (int i = 1; i < n; i++)
+;     b[i] = (double)a[i] - f * (double)a[i - 1];
+; }
+;
+; CHECK-NOT: load <{{.*}}>
+;
+; TODO: We currently do not vectorize this case because we do not attempt to
+;       determine if we can sink %0 into the
loop. Sinking this load requires +; more sophisticated analysis since it is not in the loop preheader and +; would cross a store. +; +define void @pre_load_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) { +entry: + %0 = load i16, i16* %a, align 2 + %conv = sitofp i16 %0 to double + %conv1 = fpext float %f to double + %conv2 = sitofp i16 %p to double + %mul = fmul fast double %conv2, %conv1 + %sub = fsub fast double %conv, %mul + store double %sub, double* %b, align 8 + %cmp25 = icmp sgt i32 %n, 1 + br i1 %cmp25, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %1 = phi i16 [ %2, %for.body ], [ %0, %for.body.preheader ] + %advars.iv = phi i64 [ %advars.iv.next, %for.body ], [ 1, %for.body.preheader ] + %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv + %2 = load i16, i16* %arrayidx5, align 2 + %conv6 = sitofp i16 %2 to double + %conv11 = sitofp i16 %1 to double + %mul12 = fmul fast double %conv11, %conv1 + %sub13 = fsub fast double %conv6, %mul12 + %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv + store double %sub13, double* %arrayidx15, align 8 + %advars.iv.next = add nuw nsw i64 %advars.iv, 1 + %lftr.wideiv = trunc i64 %advars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} +