Index: include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- include/llvm/Transforms/Utils/LoopUtils.h +++ include/llvm/Transforms/Utils/LoopUtils.h @@ -363,6 +363,12 @@ BasicBlock *InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA); +// These are general cross iteration phis in the header. They do not fit into +// the reduction/induction/first order recurrence phis. We identify such phis +// when their incoming value from the loop is also a phi node (coming from the +// latch). +bool isCrossIterationPhiInHeader(PHINode *Phi, Loop *TheLoop); + /// Ensure that all exit blocks of the loop are dedicated exits. /// /// For any loop exit block with non-loop predecessors, we split the loop Index: include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h =================================================================== --- include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -256,6 +256,8 @@ /// Return the set of instructions to sink to handle first-order recurrences. DenseMap &getSinkAfter() { return SinkAfter; } + RecurrenceSet *getCrossIterationPhisInHeader() { return &CrossIterationPhisInHeader; } + /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } @@ -278,6 +280,10 @@ /// Returns True if Phi is a first-order recurrence in this loop. bool isFirstOrderRecurrence(const PHINode *Phi); + bool isCrossIterationPhiInHeader(const PHINode *PN) { + return CrossIterationPhisInHeader.count(PN); + } + /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB); @@ -444,6 +450,12 @@ /// Holds the phi nodes that are first-order recurrences. RecurrenceSet FirstOrderRecurrences; + /// These are header phis that are neither induction, reduction or first + /// order recurrences. However, they fit the general recurrence pattern with + /// incoming value from the loop belonging to an if-convertable phi node in + /// the latch. + RecurrenceSet CrossIterationPhisInHeader; + /// Holds instructions that need to sink past other instructions to handle /// first-order recurrences. DenseMap SinkAfter; Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -603,10 +603,7 @@ return false; } -bool RecurrenceDescriptor::isFirstOrderRecurrence( - PHINode *Phi, Loop *TheLoop, - DenseMap &SinkAfter, DominatorTree *DT) { - +static bool isValidPhiForRecurrence(PHINode *Phi, Loop *TheLoop) { // Ensure the phi node is in the loop header and has two incoming values. if (Phi->getParent() != TheLoop->getHeader() || Phi->getNumIncomingValues() != 2) @@ -623,6 +620,52 @@ if (Phi->getBasicBlockIndex(Preheader) < 0 || Phi->getBasicBlockIndex(Latch) < 0) return false; + return true; +} + +bool llvm::isCrossIterationPhiInHeader(PHINode *Phi, Loop *TheLoop) { + if (!isValidPhiForRecurrence(Phi, TheLoop)) + return false; + + auto *Latch = TheLoop->getLoopLatch(); + + // Get the previous value. The previous value comes from the latch edge while + // the initial value comes form the preheader edge. + auto *Previous = dyn_cast(Phi->getIncomingValueForBlock(Latch)); + if (!Previous || !TheLoop->contains(Previous)) + return false; + + // Now look at the properties of Previous. + // It should be if-convertable phi node - for now we only handle two element + // phi nodes. + if (Previous->getNumIncomingValues() != 2) + return false; + + auto hasOutsideLoopUsers = [TheLoop](PHINode *P) { + // Check that all of the users of the loop are inside the BB. + for (User *U : P->users()) { + Instruction *UI = cast(U); + if (!TheLoop->contains(UI)) + return true; + } + return false; + }; + + // For now, Previous and Phi should not have outside uses. + if (hasOutsideLoopUsers(Phi) || hasOutsideLoopUsers(Previous)) + return false; + + return true; +} + +bool RecurrenceDescriptor::isFirstOrderRecurrence( + PHINode *Phi, Loop *TheLoop, + DenseMap &SinkAfter, DominatorTree *DT) { + + if (!isValidPhiForRecurrence(Phi, TheLoop)) + return false; + + auto *Latch = TheLoop->getLoopLatch(); // Get the previous value. The previous value comes from the latch edge while // the initial value comes form the preheader edge. Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -646,6 +646,13 @@ continue; } + // If nothing worked, see if it's a phi feeding from another + // if-convertable phi in the latch. + if (llvm::isCrossIterationPhiInHeader(Phi, TheLoop)) { + CrossIterationPhisInHeader.insert(Phi); + continue; + } + ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi) << "value that could not be identified as " "reduction is used outside the loop"); Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -476,10 +476,22 @@ /// this phi node. void fixFirstOrderRecurrence(PHINode *Phi); + /// Fix up the scalar post loop by passing in the recurrence phi and the first + /// value for the scalar loop, i.e. ScalarInit. + void fixScalarLoopAndOutsideUsesForRecurrence(PHINode &Phi, + Value *VectorInitForScalar, + Value *ScalarInit, + Value *Previous); + /// Fix a reduction cross-iteration phi. This is the second phase of /// vectorizing this phi node. void fixReduction(PHINode *Phi); + /// Fix the general vectorizable phi in header case. These are general + /// recurrences that feed from an (if-convertable) phi node in the latch. This is the second phase + /// of vectorizing this phi node. + void fixCrossIterationPhiInHeader(PHINode *Phi); + /// The Loop exit block may have single value PHI nodes with some /// incoming value. While vectorizing we only handled real values /// that were defined inside the loop and we should have one value for @@ -3373,7 +3385,117 @@ fixFirstOrderRecurrence(&Phi); else if (Legal->isReductionVariable(&Phi)) fixReduction(&Phi); + else if (Legal->isCrossIterationPhiInHeader(&Phi)) + fixCrossIterationPhiInHeader(&Phi); + } +} + +void InnerLoopVectorizer::fixScalarLoopAndOutsideUsesForRecurrence( + PHINode &Phi, Value *VectorInitForScalar, Value *ScalarInit, + Value *Previous) { + // Extract the last vector element in the middle block. This will be the + // initial value for the recurrence when jumping to the scalar loop. + auto *ExtractForScalar = VectorInitForScalar; + if (VF > 1) { + Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); + ExtractForScalar = Builder.CreateExtractElement( + ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); + } + // Extract the second last element in the middle block if the + // Phi is used outside the loop. We need to extract the phi itself + // and not the last element (the phi update in the current iteration). This + // will be the value when jumping to the exit block from the LoopMiddleBlock, + // when the scalar loop is not run at all. + Value *ExtractForPhiUsedOutsideLoop = nullptr; + if (VF > 1) + ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( + VectorInitForScalar, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); + // When loop is unrolled without vectorizing, initialize + // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of + // `Incoming`. This is analogous to the vectorized case above: extracting the + // second last element when VF > 1. + else if (UF > 1) + ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); + + // Fix the initial value of the original recurrence in the scalar loop. + Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); + auto *Start = Builder.CreatePHI(Phi.getType(), 2, "scalar.recur.init"); + for (auto *BB : predecessors(LoopScalarPreHeader)) { + auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; + Start->addIncoming(Incoming, BB); + } + + Phi.setIncomingValue(Phi.getBasicBlockIndex(LoopScalarPreHeader), Start); + Phi.setName("scalar.recur"); + + // Finally, fix users of the recurrence outside the loop. The users will need + // either the last value of the scalar recurrence or the last value of the + // vector recurrence we extracted in the middle block. Since the loop is in + // LCSSA form, we just need to find all the phi nodes for the original scalar + // recurrence in the exit block, and then add an edge for the middle block. + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { + if (LCSSAPhi.getIncomingValue(0) == &Phi) { + LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); + } + } +} + +void InnerLoopVectorizer::fixCrossIterationPhiInHeader(PHINode *Phi) { + + auto *Preheader = OrigLoop->getLoopPreheader(); + auto *Latch = OrigLoop->getLoopLatch(); + + // Get the initial and previous values of the scalar recurrence. + auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); + auto *Previous = Phi->getIncomingValueForBlock(Latch); + + assert(isa(Previous) && "value from latch should be a phi!"); + // Create a vector from the initial value. + auto *VectorInit = ScalarInit; + if (VF > 1) { + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + VectorInit = Builder.CreateInsertElement( + UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, + Builder.getInt32(VF - 1), "vector.recur.init"); + } + + // We constructed a temporary phi node in the first phase of vectorization. + // This phi node will eventually be deleted. + Builder.SetInsertPoint( + cast(VectorLoopValueMap.getVectorValue(Phi, 0))); + + // Create a phi node for the new recurrence. The current value will either be + // the initial value inserted into a vector or loop-varying vector value. + auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); + VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); + + // Get the vectorized previous value of the last part UF - 1. It appears last + // among all unrolled iterations, due to the order of their construction. + Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); + VecPhi->addIncoming(PreviousLastPart, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); + + // The vector from which to take the initial value for the current iteration + // (actual or unrolled). Initially, this is the vector phi node. + Value *Incoming = VecPhi; + + // Update the vector parts. + for (unsigned Part = 0; Part < UF; ++Part) { + Value *PreviousPart = getOrCreateVectorValue(Previous, Part); + Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); + PhiPart->replaceAllUsesWith(Incoming); + cast(PhiPart)->eraseFromParent(); + VectorLoopValueMap.resetVectorValue(Phi, Part, PreviousPart); + Incoming = PreviousPart; } + + fixScalarLoopAndOutsideUsesForRecurrence(*Phi, Incoming, ScalarInit, Previous); + // assert this vector phi has no uses outside the loop. +#ifdef NDEBUG + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) + assert((LCSSAPhi.getIncomingValue(0) != Phi && + LCSSAPhi.getIncomingValue(0) != Previous) && + "no user expected outside loop!"); +#endif } void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { @@ -3497,51 +3619,8 @@ // Fix the latch value of the new recurrence in the vector loop. VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); - // Extract the last vector element in the middle block. This will be the - // initial value for the recurrence when jumping to the scalar loop. - auto *ExtractForScalar = Incoming; - if (VF > 1) { - Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - ExtractForScalar = Builder.CreateExtractElement( - ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); - } - // Extract the second last element in the middle block if the - // Phi is used outside the loop. We need to extract the phi itself - // and not the last element (the phi update in the current iteration). This - // will be the value when jumping to the exit block from the LoopMiddleBlock, - // when the scalar loop is not run at all. - Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF > 1) - ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); - // When loop is unrolled without vectorizing, initialize - // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of - // `Incoming`. This is analogous to the vectorized case above: extracting the - // second last element when VF > 1. - else if (UF > 1) - ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); + fixScalarLoopAndOutsideUsesForRecurrence(*Phi, Incoming, ScalarInit, Previous); - // Fix the initial value of the original recurrence in the scalar loop. - Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); - auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); - for (auto *BB : predecessors(LoopScalarPreHeader)) { - auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; - Start->addIncoming(Incoming, BB); - } - - Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start); - Phi->setName("scalar.recur"); - - // Finally, fix users of the recurrence outside the loop. The users will need - // either the last value of the scalar recurrence or the last value of the - // vector recurrence we extracted in the middle block. Since the loop is in - // LCSSA form, we just need to find all the phi nodes for the original scalar - // recurrence in the exit block, and then add an edge for the middle block. - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - if (LCSSAPhi.getIncomingValue(0) == Phi) { - LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); - } - } } void InnerLoopVectorizer::fixReduction(PHINode *Phi) { @@ -3801,7 +3880,8 @@ // Phi nodes have cycles, so we need to vectorize them in two stages. This is // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. - if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { + if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P) || + Legal->isCrossIterationPhiInHeader(P)) { for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. Type *VecTy = Index: test/Transforms/LoopVectorize/vectorize-general-recurrence.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/vectorize-general-recurrence.ll @@ -0,0 +1,120 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +declare i8 @foo() +declare i64 @bar() + +; general recurrence predicated with selects. +; hdrphi1 = boolarr[i+c] == 1 ? boolarr[i+c] + (hdrphi2 == 0 ? hdrphi1 : 0) : (!(hdrphi2 == 0) ? hdrphi1 : -1) +define i64 @recurrence_with_select(i64 %N, i64 %c, i8 * %boolarr) { +;CHECK-LABEL: @recurrence_with_select( +; CHECK-LABEL: vector.body: +; CHECK: phi <4 x i64> +; CHECK: phi <4 x i8> +; CHECK: load <4 x i8> +; CHECK-LABEL: middle.block: +; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <4 x i64> +; CHECK: [[E2:%[a-zA-Z0-9.]+]] = extractelement <4 x i8> +; CHECK-LABEL: scalar.ph: +; CHECK: [[SRI1:%[a-zA-Z0-9.]+]] = phi i8 [ [[E2]], %middle.block ], [ %tmp, %prehdr ] +; CHECK: [[SRI2:%[a-zA-Z0-9.]+]] = phi i64 [ [[E1]], %middle.block ], [ %hdrphi1.ph.init, %prehdr ] +; CHECK-LABEL: hdr: +; CHECK: phi i64 [ %phiuseout2, %latch ], [ [[SRI2]], %scalar.ph ] +; CHECK: phi i8 [ %phiuseout1, %latch ], [ [[SRI1]], %scalar.ph ] + +prehdr: + %tmp = call i8 @foo() + %hdrphi1.ph.init = call i64 @bar() + br label %hdr + +hdr: ; preds = %latch, %prehdr + %hdrphi1 = phi i64 [ %phiuseout2, %latch ], [ %hdrphi1.ph.init, %prehdr ] + %hdrphi2 = phi i8 [ %phiuseout1, %latch ], [ %tmp, %prehdr ] + %iv = phi i64 [ %iv.next, %latch ], [ 0, %prehdr ] + %iv.derived = add nuw nsw i64 %iv, %c + %tmp17 = getelementptr inbounds i8, i8 * %boolarr, i64 %iv.derived + %boolarrload = load i8, i8 * %tmp17, align 1 + %tmp34 = icmp eq i8 %hdrphi2, 0 + %tmp19 = icmp eq i8 %boolarrload, 1 + br i1 %tmp19, label %bb36, label %bb46 + +bb36: ; preds = %bb30 + %tmp41 = select i1 %tmp34, i64 %hdrphi1, i64 0 + %tmp44 = sext i8 %boolarrload to i64 + %tmp45 = add i64 %tmp41, %tmp44 + br label %latch + +bb46: ; preds = %bb30, %bb28 + %tmp48 = xor i1 %tmp34, true + %tmp49 = zext i1 %tmp48 to i8 + %tmp50 = select i1 %tmp48, i64 %hdrphi1, i64 -1 + br label %latch + +latch: ; preds = %bb46, %bb36 + %phiuseout1 = phi i8 [ 0, %bb36 ], [ %tmp49, %bb46 ] + %phiuseout2 = phi i64 [ %tmp45, %bb36 ], [ %tmp50, %bb46 ] + %iv.next = add nuw nsw i64 %iv, 1 + %tmp55 = icmp ult i64 %iv.next, %N + br i1 %tmp55, label %hdr, label %bb56 + +bb56: ; preds = %latch + %phi.iv.lcssa = phi i64 [ %iv.next, %latch ] + ret i64 %phi.iv.lcssa +} + +; predicated stores in the loop +define void @recurrence_with_select_and_store(i64 %N, i64 %c, i8 * %boolarr, i64 * %array) { +; CHECK-LABEL: vector.memcheck: +; CHECK: br i1 %found.conflict, label %scalar.ph, label %vector.ph + +; CHECK-LABEL: vector.body: +; CHECK: phi <4 x i64> +; CHECK: phi <4 x i8> + +; CHECK-LABEL: middle.block: +; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <4 x i64> +; CHECK: [[E2:%[a-zA-Z0-9.]+]] = extractelement <4 x i8> + +; CHECK-LABEL: scalar.ph: +; CHECK: [[SRI1:%[a-zA-Z0-9.]+]] = phi i8 [ [[E2]], %middle.block ], [ %tmp, %vector.memcheck ], [ %tmp, %prehdr ] +; CHECK: [[SRI2:%[a-zA-Z0-9.]+]] = phi i64 [ [[E1]], %middle.block ], [ %hdrphi1.ph.init, %vector.memcheck ], [ %hdrphi1.ph.init, %prehdr ] +prehdr: + %tmp = call i8 @foo() + %hdrphi1.ph.init = call i64 @bar() + br label %hdr + +hdr: ; preds = %latch, %prehdr + %hdrphi1 = phi i64 [ %phiuseout2, %latch ], [ %hdrphi1.ph.init, %prehdr ] + %hdrphi2 = phi i8 [ %phiuseout1, %latch ], [ %tmp, %prehdr ] + %iv = phi i64 [ %iv.next, %latch ], [ 0, %prehdr ] + %iv.derived = add nuw nsw i64 %iv, %c + %tmp17 = getelementptr inbounds i8, i8 * %boolarr, i64 %iv.derived + %boolarrload = load i8, i8 * %tmp17, align 1 + %tmp34 = icmp eq i8 %hdrphi2, 0 + %tmp19 = icmp eq i8 %boolarrload, 1 + br i1 %tmp19, label %bb36, label %bb46 + +bb36: ; preds = %bb30 + %tmp41 = select i1 %tmp34, i64 %hdrphi1, i64 0 + %tmp44 = sext i8 %boolarrload to i64 + %tmp45 = add i64 %tmp41, %tmp44 + br label %latch + +bb46: ; preds = %bb30, %bb28 + %tmp48 = xor i1 %tmp34, true + %tmp49 = zext i1 %tmp48 to i8 + %tmp50 = select i1 %tmp48, i64 %hdrphi1, i64 -1 + %addr = getelementptr inbounds i64, i64 * %array, i64 %iv.derived + store i64 %tmp50, i64* %addr, align 1 + br label %latch + +latch: ; preds = %bb46, %bb36 + %phiuseout1 = phi i8 [ 0, %bb36 ], [ %tmp49, %bb46 ] + %phiuseout2 = phi i64 [ %tmp45, %bb36 ], [ %tmp50, %bb46 ] + %iv.next = add nuw nsw i64 %iv, 1 + %tmp55 = icmp ult i64 %iv.next, %N + br i1 %tmp55, label %hdr, label %bb56 + +bb56: ; preds = %latch + ret void +}