diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -6681,7 +6681,7 @@ return nullptr; } -static std::optional> +static std::optional> canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT, const LoopInfo &LI) { if (!L->isInnermost()) { @@ -6743,6 +6743,7 @@ PHINode *ToHelpFold = nullptr; const SCEV *TermValueS = nullptr; + bool MustDropPoison = false; for (PHINode &PN : L->getHeader()->phis()) { if (ToFold == &PN) continue; @@ -6789,10 +6790,43 @@ << "\n"); continue; } + + // The candidate IV may have been otherwise dead and poison from the + // very first iteration. If we can't disprove that, we can't use the IV. + if (!mustExecuteUBIfPoisonOnPathTo(&PN, LoopLatch->getTerminator(), &DT)) { + LLVM_DEBUG(dbgs() << "Can not prove poison safety for IV " + << PN << "\n"); + continue; + } + + // The candidate IV may become poison on the last iteration. If this + // value is not branched on, this is a well defined program. We're + // about to add a new use to this IV, and we have to ensure we don't + // insert UB which didn't previously exist. + bool MustDropPoisonLocal = false; + Instruction *PostIncV = + cast(PN.getIncomingValueForBlock(LoopLatch)); + if (!mustExecuteUBIfPoisonOnPathTo(PostIncV, LoopLatch->getTerminator(), + &DT)) { + LLVM_DEBUG(dbgs() << "Can not prove poison safety to insert use" + << PN << "\n"); + + // If this is a complex recurrance with multiple instructions computing + // the backedge value, we might need to strip poison flags from all of + // them. + if (PostIncV->getOperand(0) != &PN) + continue; + + // In order to perform the transform, we need to drop the poison generating + // flags on this instruction (if any). + MustDropPoisonLocal = PostIncV->hasPoisonGeneratingFlags(); + } + // We pick the last legal alternate IV. We could expore choosing an optimal // alternate IV if we had a decent heuristic to do so. ToHelpFold = &PN; TermValueS = TermValueSLocal; + MustDropPoison = MustDropPoisonLocal; } LLVM_DEBUG(if (ToFold && !ToHelpFold) dbgs() @@ -6808,7 +6842,7 @@ if (!ToFold || !ToHelpFold) return std::nullopt; - return std::make_tuple(ToFold, ToHelpFold, TermValueS); + return std::make_tuple(ToFold, ToHelpFold, TermValueS, MustDropPoison); } static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, @@ -6871,7 +6905,7 @@ if (AllowTerminatingConditionFoldingAfterLSR) { if (auto Opt = canFoldTermCondOfLoop(L, SE, DT, LI)) { - auto [ToFold, ToHelpFold, TermValueS] = *Opt; + auto [ToFold, ToHelpFold, TermValueS, MustDrop] = *Opt; Changed = true; NumTermFold++; @@ -6889,6 +6923,10 @@ (void)StartValue; Value *LoopValue = ToHelpFold->getIncomingValueForBlock(LoopLatch); + // See comment in canFoldTermCondOfLoop on why this is sufficient. + if (MustDrop) + cast(LoopValue)->dropPoisonGeneratingFlags(); + // SCEVExpander for both use in preheader and latch const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); SCEVExpander Expander(SE, DL, "lsr_fold_term_cond"); diff --git a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll --- a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll @@ -124,7 +124,7 @@ ; CHECK-NEXT: [[IT_04:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[START_PTRPTR]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[IT_04]], align 8 ; CHECK-NEXT: tail call void @foo(ptr [[TMP4]]) -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds ptr, ptr [[IT_04]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr ptr, ptr [[IT_04]], i64 1 ; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[SCEVGEP]] ; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: @@ -170,7 +170,7 @@ ; CHECK-NEXT: [[DST_04:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[MARK]], [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DST_04]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = call ptr @foo(ptr [[TMP4]]) -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds ptr, ptr [[DST_04]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr ptr, ptr [[DST_04]], i64 1 ; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[SCEVGEP]] ; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] ; @@ -197,9 +197,9 @@ ; advance the pointer IV by *4* each time, and thus on the iteration we write ; byte 16, %uglygep2 (the pointer increment) is past the end of the underlying ; storage and thus violates the inbounds requirements. As a result, %uglygep2 -; is poison on the final iteration. If we insert a branch on that value, we -; have inserted undefined behavior where it did not previously exist. -; FIXME: miscompile +; is poison on the final iteration. If we insert a branch on that value +; (without stripping the poison flag), we have inserted undefined behavior +; where it did not previously exist. define void @inbounds_poison_use(ptr %a) { ; CHECK-LABEL: @inbounds_poison_use( ; CHECK-NEXT: entry: @@ -208,7 +208,7 @@ ; CHECK: for.body: ; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[A]], [[ENTRY:%.*]] ] ; CHECK-NEXT: store i8 1, ptr [[LSR_IV1]], align 4 -; CHECK-NEXT: [[UGLYGEP2]] = getelementptr inbounds i8, ptr [[LSR_IV1]], i64 4 +; CHECK-NEXT: [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 ; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[SCEVGEP]] ; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: