diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1559,6 +1559,21 @@ return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); } + /// Returns true if we're required to use a scalar epilogue for at least + /// the final iteration of the original loop for all VFs in \p Range. + /// A scalar epilogue must either be required for all VFs in \p Range or for + /// none. + bool requiresScalarEpilogue(VFRange Range) const { + auto RequiresScalarEpilogue = [this](ElementCount VF) { + return requiresScalarEpilogue(VF); + }; + bool IsRequired = all_of(Range, RequiresScalarEpilogue); + assert( + (IsRequired || none_of(Range, RequiresScalarEpilogue)) && + "all VFs in range must agree on whether a scalar epilogue is required"); + return IsRequired; + } + /// Returns true if a scalar epilogue is not allowed due to optsize or a /// loop hint annotation. bool isScalarEpilogueAllowed() const { @@ -3726,8 +3741,10 @@ // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. - Plan.clearLiveOuts(); } else { + // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking + // the cost model. + // If we inserted an edge from the middle block to the unique exit block, // update uses outside the loop (phis) to account for the newly inserted // edge. @@ -8993,7 +9010,12 @@ // After here, VPBB should not be used. VPBB = nullptr; - addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); + if (CM.requiresScalarEpilogue(Range)) { + // No edge from the middle block to the unique exit block has been inserted + // and there is nothing to fix from vector loop; phis should have incoming + // from scalar loop only. 
+ } else + addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2425,12 +2425,6 @@ void addLiveOut(PHINode *PN, VPValue *V); - void clearLiveOuts() { - for (auto &KV : LiveOuts) - delete KV.second; - LiveOuts.clear(); - } - void removeLiveOut(PHINode *PN) { delete LiveOuts[PN]; LiveOuts.erase(PN); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -580,7 +580,9 @@ #endif VPlan::~VPlan() { - clearLiveOuts(); + for (auto &KV : LiveOuts) + delete KV.second; + LiveOuts.clear(); if (Entry) { VPValue DummyValue; diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll --- a/llvm/test/Transforms/LoopVectorize/loop-form.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll @@ -38,7 +38,7 @@ ; CHECK-NEXT: store i16 0, ptr [[B]], align 4 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: if.end: ; CHECK-NEXT: ret void ; @@ -92,7 +92,7 @@ ; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 ; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 ; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]] -; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP2:![0-9]+]] +; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP3:![0-9]+]] ; TAILFOLD: if.end: ; TAILFOLD-NEXT: ret void ; @@ -199,7 
+199,6 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: store <2 x i32> , ptr [[TMP2]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998