diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3720,12 +3720,7 @@ VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); - if (Cost->requiresScalarEpilogue(VF)) { - // No edge from the middle block to the unique exit block has been inserted - // and there is nothing to fix from vector loop; phis should have incoming - // from scalar loop only. - Plan.clearLiveOuts(); - } else { + if (!Cost->requiresScalarEpilogue(VF)) { // If we inserted an edge from the middle block to the unique exit block, // update uses outside the loop (phis) to account for the newly inserted // edge. @@ -8808,7 +8803,14 @@ // original exit block. static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, VPBasicBlock *MiddleVPBB, Loop *OrigLoop, - VPlan &Plan) { + VPlan &Plan, bool RequiresScalarEpilogue) { + if (RequiresScalarEpilogue) { + // No edge from the middle block to the unique exit block has been inserted + // and there is nothing to fix from vector loop; phis should have incoming + // from scalar loop only. + + return; + } BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); // Only handle single-exit loops with unique exit blocks for now. @@ -8984,7 +8986,15 @@ // After here, VPBB should not be used. 
VPBB = nullptr; - addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); + auto RequiresScalarEpilogue = [this](ElementCount VF) { + return CM.requiresScalarEpilogue(VF); + }; + assert( + (all_of(Range, RequiresScalarEpilogue) || + none_of(Range, RequiresScalarEpilogue)) && + "all VFs in range must agree on whether a scalar epilogue is required"); + addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan, + all_of(Range, RequiresScalarEpilogue)); assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2423,12 +2423,6 @@ void addLiveOut(PHINode *PN, VPValue *V); - void clearLiveOuts() { - for (auto &KV : LiveOuts) - delete KV.second; - LiveOuts.clear(); - } - void removeLiveOut(PHINode *PN) { delete LiveOuts[PN]; LiveOuts.erase(PN); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -580,7 +580,9 @@ #endif VPlan::~VPlan() { - clearLiveOuts(); + for (auto &KV : LiveOuts) + delete KV.second; + LiveOuts.clear(); if (Entry) { VPValue DummyValue; diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll --- a/llvm/test/Transforms/LoopVectorize/loop-form.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll @@ -38,7 +38,7 @@ ; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: if.end: ; CHECK-NEXT: ret void ; @@ -92,7 +92,7 @@ ; TAILFOLD-NEXT: store 
i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]] -; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP2:![0-9]+]] +; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP3:![0-9]+]] ; TAILFOLD: if.end: ; TAILFOLD-NEXT: ret void ; @@ -199,7 +199,6 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: store <2 x i32> , ptr [[TMP2]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998