diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -494,7 +494,8 @@ void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, bool InvariantCond, VPTransformState &State); - /// Fix the vectorized code, taking care of header phi's, live-outs, and more. + /// Fix the vectorized code, taking care of updating the branch condition in + /// the middle block, setting header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State); // Return true if any runtime check is added. @@ -732,8 +733,7 @@ Loop *L, Value *VectorTripCount, std::pair AdditionalBypass = {nullptr, nullptr}); - /// Complete the loop skeleton by adding debug MDs, creating appropriate - /// conditional branches in the middle block, preparing the builder and + /// Complete the loop skeleton by adding debug MDs, preparing the builder and /// running the verifier. Take in the vector loop \p L as argument, and return /// the preheader of the completed vector loop. BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); @@ -3399,13 +3399,20 @@ nullptr, Twine(Prefix) + "scalar.ph"); // Set up branch from middle block to the exit and scalar preheader blocks. - // completeLoopSkeleton will update the condition to use an iteration check, + // fixVectorizedLoop will update the condition to use an iteration check, // if required to decide whether to execute the remainder. - BranchInst *BrInst = - BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); + BranchInst *BrInst = BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, + Builder.getFalse()); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); + // LoopExitBlock now has LoopMiddleBlock as new predecessor. Add a dummy + // incoming value, to keep the IR valid until it gets replaced with the + // concrete value after vectorization. The edge from LoopMiddleBlock to + // LoopExitBlock is guaranteed to be dead at least until the phi is updated + // with the concrete incoming value. + for (PHINode &P : LoopExitBlock->phis()) + P.addIncoming(UndefValue::get(P.getType()), LoopMiddleBlock); // We intentionally don't let SplitBlock to update LoopInfo since // LoopVectorBody should belong to another loop than LoopVectorPreHeader. @@ -3508,31 +3515,6 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, MDNode *OrigLoopID) { - assert(L && "Expected valid loop."); - - // The trip counts should be cached by now. - Value *Count = getOrCreateTripCount(L); - Value *VectorTripCount = getOrCreateVectorTripCount(L); - - auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); - - // Add a check in the middle block to see if we have completed - // all of the iterations in the first vector loop. - // If (N - N%VF) == N, then we *don't* need to run the remainder. - // If tail is to be folded, we know we don't need to run the remainder. - if (!Cost->foldTailByMasking()) { - Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - Count, VectorTripCount, "cmp.n", - LoopMiddleBlock->getTerminator()); - - // Here we use the same DebugLoc as the scalar loop latch terminator instead - // of the corresponding compare because they may have ended up with - // different line numbers and we want to avoid awkward line stepping while - // debugging. Eg. if the compare has got a line number inside the loop. - CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); - cast(LoopMiddleBlock->getTerminator())->setCondition(CmpN); - } - // Get ready to start creating new instructions into the vectorized body. assert(LoopVectorPreHeader == L->getLoopPreheader() && "Inconsistent vector loop preheader"); @@ -3715,8 +3697,8 @@ // In this case, if IV1 has an external use, we need to avoid adding both // "last value of IV1" and "penultimate value of IV2". So, verify that we // don't already have an incoming value for the middle block. - if (PHI->getBasicBlockIndex(MiddleBlock) == -1) - PHI->addIncoming(I.second, MiddleBlock); + if (isa(PHI->getIncomingValueForBlock(MiddleBlock))) + PHI->setIncomingValueForBlock(MiddleBlock, I.second); } } @@ -4001,6 +3983,29 @@ } void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { + auto *L = LI->getLoopFor(LoopVectorBody); + // The trip counts should be cached by now. + Value *Count = getOrCreateTripCount(L); + Value *VectorTripCount = getOrCreateVectorTripCount(L); + auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); + Value *MiddleBlockCond = Builder.getTrue(); + // Create a check in the middle block to see if we have completed + // all of the iterations in the first vector loop. + // If (N - N%VF) == N, then we *don't* need to run the remainder. + // If tail is to be folded, we know we don't need to run the remainder. + if (!Cost->foldTailByMasking()) { + Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + Count, VectorTripCount, "cmp.n", + LoopMiddleBlock->getTerminator()); + + // Here we use the same DebugLoc as the scalar loop latch terminator instead + // of the corresponding compare because they may have ended up with + // different line numbers and we want to avoid awkward line stepping while + // debugging. Eg. if the compare has got a line number inside the loop. + CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); + MiddleBlockCond = CmpN; + } + // Insert truncates and extends for any truncated instructions as hints to // InstCombine. if (VF.isVector()) @@ -4019,9 +4024,6 @@ // This is the second stage of vectorizing recurrences. fixCrossIterationPHIs(State); - // Forget the original basic block. - PSE.getSE()->forgetLoop(OrigLoop); - // Fix-up external users of the induction variables. for (auto &Entry : Legal->getInductionVars()) fixupIVUsers(Entry.first, Entry.second, @@ -4032,6 +4034,15 @@ for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); + // Now all PHIs have been updated with the correct incoming values. Set the + // correct condition for the branch from the middle block to the scalar loop + // and exit block. + cast(LoopMiddleBlock->getTerminator()) + ->setCondition(MiddleBlockCond); + + // Forget the original basic block. + PSE.getSE()->forgetLoop(OrigLoop); + // Remove redundant induction instructions. cse(LoopVectorBody); @@ -4253,7 +4264,8 @@ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) if (any_of(LCSSAPhi.incoming_values(), [Phi](Value *V) { return V == Phi; })) - LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); + LCSSAPhi.setIncomingValueForBlock(LoopMiddleBlock, + ExtractForPhiUsedOutsideLoop); } static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) { @@ -4430,7 +4442,7 @@ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) if (any_of(LCSSAPhi.incoming_values(), [LoopExitInst](Value *V) { return V == LoopExitInst; })) - LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); + LCSSAPhi.setIncomingValueForBlock(LoopMiddleBlock, ReducedPartRdx); // Fix the scalar loop reduction variable with the incoming reduction sum // from the vector body and from the backedge value. @@ -4475,7 +4487,7 @@ void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { - if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) + if (!isa(LCSSAPhi.getIncomingValueForBlock(LoopMiddleBlock))) // Some phis were already hand updated by the reduction and recurrence // code above, leave them alone. continue; @@ -4497,7 +4509,7 @@ ? IncomingValue : State.get(State.Plan->getVPValue(IncomingValue), VPIteration(UF - 1, Lane)); - LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); + LCSSAPhi.setIncomingValueForBlock(LoopMiddleBlock, lastIncomingValue); } } diff --git a/llvm/test/Transforms/LoopVectorize/scev-verify-ir.ll b/llvm/test/Transforms/LoopVectorize/scev-verify-ir.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/scev-verify-ir.ll @@ -0,0 +1,77 @@ +; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -scev-verify-ir -S %s | FileCheck %s + +; Make sure SCEV is not queried while the IR is temporarily invalid. The tests +; deliberately do not check for details of the vectorized IR, because that's +; not the focus of the test. + +define void @pr49538() { +; CHECK-LABEL: @pr49538 +; CHECK: vector.body: +; +entry: + br label %loop.0 + +loop.0: + %iv.0 = phi i16 [ -1, %entry ], [ %iv.0.next, %loop.0.latch ] + br label %loop.1 + +loop.1: + %iv.1 = phi i16 [ -1, %loop.0 ], [ %iv.1.next, %loop.1 ] + %iv.1.next = add nsw i16 %iv.1, 1 + %i6 = icmp eq i16 %iv.1.next, %iv.0 + br i1 %i6, label %loop.0.latch, label %loop.1 + +loop.0.latch: + %i8 = phi i16 [ 1, %loop.1 ] + %iv.0.next = add nsw i16 %iv.0, 1 + %ec.0 = icmp eq i16 %iv.0.next, %i8 + br i1 %ec.0, label %exit, label %loop.0 + +exit: + ret void +} + +define void @pr49900(i32 %x, i64* %ptr) { +; CHECK-LABEL: @pr49900 +; CHECK: vector.body{{.*}}: +; CHECK: vector.body{{.*}}: +; +entry: + br label %loop.0 + +loop.0: ; preds = %bb2, %bb + %ec.0 = icmp slt i32 %x, 0 + br i1 %ec.0, label %loop.0, label %loop.1.ph + +loop.1.ph: ; preds = %bb2 + br label %loop.1 + +loop.1: ; preds = %bb33, %bb5 + %iv.1 = phi i32 [ 0, %loop.1.ph ], [ %iv.3.next, %loop.1.latch ] + br label %loop.2 + +loop.2: + %iv.2 = phi i32 [ %iv.1, %loop.1 ], [ %iv.2.next, %loop.2 ] + %tmp54 = add i32 %iv.2, 12 + %iv.2.next = add i32 %iv.2, 13 + %ext = zext i32 %iv.2.next to i64 + %tmp56 = add nuw nsw i64 %ext, 1 + %C6 = icmp sle i32 %tmp54, 65536 + br i1 %C6, label %loop.2, label %loop.3.ph + +loop.3.ph: + br label %loop.3 + +loop.3: + %iv.3 = phi i32 [ %iv.2.next, %loop.3.ph ], [ %iv.3.next, %loop.3 ] + %iv.3.next = add i32 %iv.3 , 13 + %C1 = icmp ult i32 %iv.3.next, 65536 + br i1 %C1, label %loop.3, label %loop.1.latch + +loop.1.latch: + %ec = icmp ne i32 %iv.1, 9999 + br i1 %ec, label %loop.1, label %exit + +exit: + ret void +}