Index: llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp =================================================================== --- llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -79,6 +79,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -215,10 +216,9 @@ /// /// NB! There may be conditions feeding into \p BI that aren't inductive range /// checks, and hence don't end up in \p Checks. - static void - extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE, - BranchProbabilityInfo *BPI, - SmallVectorImpl &Checks); + static void extractRangeChecksFromBranch( + BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI, + SmallVectorImpl &Checks, bool &Changed); }; struct LoopStructure; @@ -365,16 +365,29 @@ void InductiveRangeCheck::extractRangeChecksFromBranch( BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI, - SmallVectorImpl &Checks) { + SmallVectorImpl &Checks, bool &Changed) { if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch()) return; + unsigned IndexLoopSucc = L->contains(BI->getSuccessor(0)) ? 0 : 1; + assert(L->contains(BI->getSuccessor(IndexLoopSucc)) && + "No edges coming to loop?"); BranchProbability LikelyTaken(15, 16); if (!SkipProfitabilityChecks && BPI && - BPI->getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken) + BPI->getEdgeProbability(BI->getParent(), IndexLoopSucc) < LikelyTaken) return; + // IRCE expects branch's true edge comes to loop. Invert branch for opposite + // case + if (IndexLoopSucc != 0) { + IRBuilder<> Builder(BI); + InvertBranch(BI, Builder); + if (BPI) + BPI->swapSuccEdgesProbabilities(BI->getParent()); + Changed = true; + } + SmallPtrSet Visited; InductiveRangeCheck::extractRangeChecksFromCond(L, SE, BI->getOperandUse(0), Checks, Visited); @@ -1840,14 +1853,15 @@ LLVMContext &Context = Preheader->getContext(); SmallVector RangeChecks; + bool Changed = false; for (auto *BBI : L->getBlocks()) if (BranchInst *TBI = dyn_cast(BBI->getTerminator())) InductiveRangeCheck::extractRangeChecksFromBranch(TBI, L, SE, BPI, - RangeChecks); + RangeChecks, Changed); if (RangeChecks.empty()) - return false; + return Changed; auto PrintRecognizedRangeChecks = [&](raw_ostream &OS) { OS << "irce: looking at loop "; L->print(OS); @@ -1868,11 +1882,11 @@ if (!MaybeLoopStructure) { LLVM_DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason << "\n";); - return false; + return Changed; } LoopStructure LS = *MaybeLoopStructure; if (!isProfitableToTransform(*L, LS)) - return false; + return Changed; const SCEVAddRecExpr *IndVar = cast(SE.getMinusSCEV(SE.getSCEV(LS.IndVarBase), SE.getSCEV(LS.IndVarStep))); @@ -1903,12 +1917,13 @@ } if (!SafeIterRange) - return false; + return Changed; LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT, *SafeIterRange); - bool Changed = LC.run(); - if (Changed) { + if (LC.run()) { + Changed = true; + auto PrintConstrainedLoopInfo = [L]() { dbgs() << "irce: in function "; dbgs() << L->getHeader()->getParent()->getName() << ": "; Index: llvm/test/Transforms/IRCE/non-loop-invariant-rhs-instr.ll =================================================================== --- llvm/test/Transforms/IRCE/non-loop-invariant-rhs-instr.ll +++ llvm/test/Transforms/IRCE/non-loop-invariant-rhs-instr.ll @@ -25,8 +25,8 @@ ; CHECK: guarded: ; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ARRAY:%.*]], i64 [[INDVAR]] ; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[ADDR]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RES]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[ZERO_LOOPEXIT_LOOPEXIT4:%.*]], label [[LATCH]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[RES]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[LATCH]], label [[ZERO_LOOPEXIT_LOOPEXIT4:%.*]] ; CHECK: latch: ; CHECK-NEXT: [[INDVAR_NEXT]] = add nuw nsw i64 [[INDVAR]], 2 ; CHECK-NEXT: [[RES2:%.*]] = mul i32 [[RES]], 3 @@ -73,8 +73,8 @@ ; CHECK: guarded.postloop: ; CHECK-NEXT: [[ADDR_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[INDVAR_POSTLOOP]] ; CHECK-NEXT: [[RES_POSTLOOP:%.*]] = load i32, ptr [[ADDR_POSTLOOP]], align 4 -; CHECK-NEXT: [[CMP_POSTLOOP:%.*]] = icmp eq i32 [[RES_POSTLOOP]], 0 -; CHECK-NEXT: br i1 [[CMP_POSTLOOP]], label [[ZERO_LOOPEXIT_LOOPEXIT:%.*]], label [[LATCH_POSTLOOP]] +; CHECK-NEXT: [[CMP_POSTLOOP:%.*]] = icmp ne i32 [[RES_POSTLOOP]], 0 +; CHECK-NEXT: br i1 [[CMP_POSTLOOP]], label [[LATCH_POSTLOOP]], label [[ZERO_LOOPEXIT_LOOPEXIT:%.*]] ; CHECK: latch.postloop: ; CHECK-NEXT: [[INDVAR_NEXT_POSTLOOP]] = add nuw nsw i64 [[INDVAR_POSTLOOP]], 2 ; CHECK-NEXT: [[RES2_POSTLOOP]] = mul i32 [[RES_POSTLOOP]], 3 Index: llvm/test/Transforms/IRCE/stride_more_than_1.ll =================================================================== --- llvm/test/Transforms/IRCE/stride_more_than_1.ll +++ llvm/test/Transforms/IRCE/stride_more_than_1.ll @@ -702,7 +702,7 @@ } ; Same as test_09 but range check comparison is inversed. -; TODO: IRCE is allowed. +; IRCE is allowed. define i32 @test_10(ptr %p, ptr %capacity_p, ptr %num_elements_p) { ; CHECK-LABEL: define i32 @test_10 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[CAPACITY_P:%.*]], ptr [[NUM_ELEMENTS_P:%.*]]) { @@ -710,23 +710,66 @@ ; CHECK-NEXT: [[CAPACITY:%.*]] = load i32, ptr [[CAPACITY_P]], align 4, !range [[RNG16]] ; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = load i32, ptr [[NUM_ELEMENTS_P]], align 4, !range [[RNG16]] ; CHECK-NEXT: [[LIMIT:%.*]] = sub i32 [[CAPACITY]], 3 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[CAPACITY]], -3 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[CAPACITY]], 2147483646 +; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP0]], [[SMAX]] +; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[LIMIT]], i32 0) +; CHECK-NEXT: [[SMAX2:%.*]] = call i32 @llvm.smax.i32(i32 [[SMIN]], i32 -1) +; CHECK-NEXT: [[TMP3:%.*]] = add nsw i32 [[SMAX2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[SMIN3:%.*]] = call i32 @llvm.smin.i32(i32 [[NUM_ELEMENTS]], i32 [[TMP4]]) +; CHECK-NEXT: [[EXIT_MAINLOOP_AT:%.*]] = call i32 @llvm.smax.i32(i32 [[SMIN3]], i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt i32 0, [[EXIT_MAINLOOP_AT]] +; CHECK-NEXT: br i1 [[TMP5]], label [[LOOP_PREHEADER:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]] +; CHECK: loop.preheader: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[CAPACITY_CHECK:%.*]] = icmp sge i32 [[IV]], [[LIMIT]] -; CHECK-NEXT: br i1 [[CAPACITY_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]], !prof [[PROF19:![0-9]+]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[CAPACITY_CHECK:%.*]] = icmp slt i32 [[IV]], [[LIMIT]] +; CHECK-NEXT: br i1 true, label [[BACKEDGE]], label [[OUT_OF_BOUNDS_LOOPEXIT5:%.*]], !prof [[PROF17]] ; CHECK: backedge: ; CHECK-NEXT: [[IV_WIDE:%.*]] = zext i32 [[IV]] to i64 ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i64 [[IV_WIDE]] ; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[NUM_ELEMENTS]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt i32 [[IV_NEXT]], [[EXIT_MAINLOOP_AT]] +; CHECK-NEXT: br i1 [[TMP6]], label [[LOOP]], label [[MAIN_EXIT_SELECTOR:%.*]] +; CHECK: main.exit.selector: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], [[BACKEDGE]] ] +; CHECK-NEXT: [[TMP7:%.*]] = icmp slt i32 [[IV_NEXT_LCSSA]], [[NUM_ELEMENTS]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MAIN_PSEUDO_EXIT]], label [[EXIT:%.*]] +; CHECK: main.pseudo.exit: +; CHECK-NEXT: [[IV_COPY:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT_LCSSA]], [[MAIN_EXIT_SELECTOR]] ] +; CHECK-NEXT: [[INDVAR_END:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT_LCSSA]], [[MAIN_EXIT_SELECTOR]] ] +; CHECK-NEXT: br label [[POSTLOOP:%.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: [[IV_LCSSA1_PH:%.*]] = phi i32 [ [[IV_POSTLOOP:%.*]], [[BACKEDGE_POSTLOOP:%.*]] ] +; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[IV_LCSSA1:%.*]] = phi i32 [ [[IV]], [[BACKEDGE]] ] +; CHECK-NEXT: [[IV_LCSSA1:%.*]] = phi i32 [ [[IV_LCSSA]], [[MAIN_EXIT_SELECTOR]] ], [ [[IV_LCSSA1_PH]], [[EXIT_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret i32 [[IV_LCSSA1]] +; CHECK: out_of_bounds.loopexit: +; CHECK-NEXT: br label [[OUT_OF_BOUNDS:%.*]] +; CHECK: out_of_bounds.loopexit5: +; CHECK-NEXT: br label [[OUT_OF_BOUNDS]] ; CHECK: out_of_bounds: ; CHECK-NEXT: ret i32 -1 +; CHECK: postloop: +; CHECK-NEXT: br label [[LOOP_POSTLOOP:%.*]] +; CHECK: loop.postloop: +; CHECK-NEXT: [[IV_POSTLOOP]] = phi i32 [ [[IV_COPY]], [[POSTLOOP]] ], [ [[IV_NEXT_POSTLOOP:%.*]], [[BACKEDGE_POSTLOOP]] ] +; CHECK-NEXT: [[CAPACITY_CHECK_POSTLOOP:%.*]] = icmp slt i32 [[IV_POSTLOOP]], [[LIMIT]] +; CHECK-NEXT: br i1 [[CAPACITY_CHECK_POSTLOOP]], label [[BACKEDGE_POSTLOOP]], label [[OUT_OF_BOUNDS_LOOPEXIT:%.*]], !prof [[PROF17]] +; CHECK: backedge.postloop: +; CHECK-NEXT: [[IV_WIDE_POSTLOOP:%.*]] = zext i32 [[IV_POSTLOOP]] to i64 +; CHECK-NEXT: [[EL_PTR_POSTLOOP:%.*]] = getelementptr i32, ptr [[P]], i64 [[IV_WIDE_POSTLOOP]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR_POSTLOOP]], align 4 +; CHECK-NEXT: [[IV_NEXT_POSTLOOP]] = add nuw nsw i32 [[IV_POSTLOOP]], 4 +; CHECK-NEXT: [[LOOP_COND_POSTLOOP:%.*]] = icmp slt i32 [[IV_NEXT_POSTLOOP]], [[NUM_ELEMENTS]] +; CHECK-NEXT: br i1 [[LOOP_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP19:![0-9]+]], !irce.loop.clone !6 ; entry: %capacity = load i32, ptr %capacity_p, !range !4