diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
--- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -739,6 +739,19 @@
           SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit));
 }
 
+/// Returns an estimate of the max latch taken count of \p L in the narrowest
+/// available type. The latch block's own symbolic max exit count is preferred.
+/// If it is not computable, fall back to the symbolic max backedge-taken count
+/// of the whole loop (possibly a wider type), which still beats no estimate.
+static const SCEV *getNarrowestLatchMaxTakenCountEstimate(ScalarEvolution &SE,
+                                                          const Loop &L) {
+  const SCEV *FromBlock =
+      SE.getExitCount(&L, L.getLoopLatch(), ScalarEvolution::SymbolicMaximum);
+  if (isa<SCEVCouldNotCompute>(FromBlock))
+    return SE.getSymbolicMaxBackedgeTakenCount(&L);
+  return FromBlock;
+}
+
 std::optional<LoopStructure>
 LoopStructure::parseLoopStructure(ScalarEvolution &SE, Loop &L,
                                   const char *&FailureReason) {
@@ -781,12 +794,12 @@
     return std::nullopt;
   }
 
-  const SCEV *LatchCount = SE.getExitCount(&L, Latch);
-  if (isa<SCEVCouldNotCompute>(LatchCount)) {
+  const SCEV *MaxBETakenCount = getNarrowestLatchMaxTakenCountEstimate(SE, L);
+  if (isa<SCEVCouldNotCompute>(MaxBETakenCount)) {
     FailureReason = "could not compute latch count";
     return std::nullopt;
   }
-  assert(SE.getLoopDisposition(LatchCount, &L) ==
+  assert(SE.getLoopDisposition(MaxBETakenCount, &L) ==
              ScalarEvolution::LoopInvariant &&
          "loop variant exit count doesn't make sense!");
 
@@ -1392,12 +1405,12 @@
 
 bool LoopConstrainer::run() {
   BasicBlock *Preheader = nullptr;
-  const SCEV *LatchTakenCount =
-      SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch);
+  const SCEV *MaxBETakenCount =
+      getNarrowestLatchMaxTakenCountEstimate(SE, OriginalLoop);
   Preheader = OriginalLoop.getLoopPreheader();
-  assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr &&
+  assert(!isa<SCEVCouldNotCompute>(MaxBETakenCount) && Preheader != nullptr &&
          "preconditions!");
-  ExitCountTy = cast<IntegerType>(LatchTakenCount->getType());
+  ExitCountTy = cast<IntegerType>(MaxBETakenCount->getType());
 
   OriginalPreheader = Preheader;
   MainLoopPreheader = Preheader;
diff --git a/llvm/test/Transforms/IRCE/stride_more_than_1.ll b/llvm/test/Transforms/IRCE/stride_more_than_1.ll
--- a/llvm/test/Transforms/IRCE/stride_more_than_1.ll
+++ b/llvm/test/Transforms/IRCE/stride_more_than_1.ll
@@ -598,7 +598,7 @@
   ret void
 }
 
-; TODO: IRCE is legal here.
+; IRCE is legal here.
 ; Here how it is done if the step was 1: https://godbolt.org/z/jEqWaseWc
 ; It is also legal for step 4. Proof:
 ; - Capacity check ensures that iv < limit <= SINT_MAX - 3, meaning that
@@ -614,23 +614,66 @@
 ; CHECK-NEXT:    [[CAPACITY:%.*]] = load i32, ptr [[CAPACITY_P]], align 4, !range [[RNG16:![0-9]+]]
 ; CHECK-NEXT:    [[NUM_ELEMENTS:%.*]] = load i32, ptr [[NUM_ELEMENTS_P]], align 4, !range [[RNG16]]
 ; CHECK-NEXT:    [[LIMIT:%.*]] = sub i32 [[CAPACITY]], 3
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[CAPACITY]], -3
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[CAPACITY]], 2147483646
+; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 [[TMP0]], [[SMAX]]
+; CHECK-NEXT:    [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[LIMIT]], i32 0)
+; CHECK-NEXT:    [[SMAX2:%.*]] = call i32 @llvm.smax.i32(i32 [[SMIN]], i32 -1)
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i32 [[SMAX2]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[SMIN3:%.*]] = call i32 @llvm.smin.i32(i32 [[NUM_ELEMENTS]], i32 [[TMP4]])
+; CHECK-NEXT:    [[EXIT_MAINLOOP_AT:%.*]] = call i32 @llvm.smax.i32(i32 [[SMIN3]], i32 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt i32 0, [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[LOOP_PREHEADER:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK:       loop.preheader:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT:    [[CAPACITY_CHECK:%.*]] = icmp slt i32 [[IV]], [[LIMIT]]
-; CHECK-NEXT:    br i1 [[CAPACITY_CHECK]], label [[BACKEDGE]], label [[OUT_OF_BOUNDS:%.*]], !prof [[PROF17:![0-9]+]]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[OUT_OF_BOUNDS_LOOPEXIT5:%.*]], !prof [[PROF17:![0-9]+]]
 ; CHECK:       backedge:
 ; CHECK-NEXT:    [[IV_WIDE:%.*]] = zext i32 [[IV]] to i64
 ; CHECK-NEXT:    [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i64 [[IV_WIDE]]
 ; CHECK-NEXT:    store i32 1, ptr [[EL_PTR]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4
 ; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[NUM_ELEMENTS]]
-; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt i32 [[IV_NEXT]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[LOOP]], label [[MAIN_EXIT_SELECTOR:%.*]]
+; CHECK:       main.exit.selector:
+; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt i32 [[IV_NEXT_LCSSA]], [[NUM_ELEMENTS]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MAIN_PSEUDO_EXIT]], label [[EXIT:%.*]]
+; CHECK:       main.pseudo.exit:
+; CHECK-NEXT:    [[IV_COPY:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[POSTLOOP:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    [[IV_LCSSA1_PH:%.*]] = phi i32 [ [[IV_POSTLOOP:%.*]], [[BACKEDGE_POSTLOOP:%.*]] ]
+; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[IV_LCSSA1:%.*]] = phi i32 [ [[IV]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[IV_LCSSA1:%.*]] = phi i32 [ [[IV_LCSSA]], [[MAIN_EXIT_SELECTOR]] ], [ [[IV_LCSSA1_PH]], [[EXIT_LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    ret i32 [[IV_LCSSA1]]
+; CHECK:       out_of_bounds.loopexit:
+; CHECK-NEXT:    br label [[OUT_OF_BOUNDS:%.*]]
+; CHECK:       out_of_bounds.loopexit5:
+; CHECK-NEXT:    br label [[OUT_OF_BOUNDS]]
 ; CHECK:       out_of_bounds:
 ; CHECK-NEXT:    ret i32 -1
+; CHECK:       postloop:
+; CHECK-NEXT:    br label [[LOOP_POSTLOOP:%.*]]
+; CHECK:       loop.postloop:
+; CHECK-NEXT:    [[IV_POSTLOOP]] = phi i32 [ [[IV_COPY]], [[POSTLOOP]] ], [ [[IV_NEXT_POSTLOOP:%.*]], [[BACKEDGE_POSTLOOP]] ]
+; CHECK-NEXT:    [[CAPACITY_CHECK_POSTLOOP:%.*]] = icmp slt i32 [[IV_POSTLOOP]], [[LIMIT]]
+; CHECK-NEXT:    br i1 [[CAPACITY_CHECK_POSTLOOP]], label [[BACKEDGE_POSTLOOP]], label [[OUT_OF_BOUNDS_LOOPEXIT:%.*]], !prof [[PROF17]]
+; CHECK:       backedge.postloop:
+; CHECK-NEXT:    [[IV_WIDE_POSTLOOP:%.*]] = zext i32 [[IV_POSTLOOP]] to i64
+; CHECK-NEXT:    [[EL_PTR_POSTLOOP:%.*]] = getelementptr i32, ptr [[P]], i64 [[IV_WIDE_POSTLOOP]]
+; CHECK-NEXT:    store i32 1, ptr [[EL_PTR_POSTLOOP]], align 4
+; CHECK-NEXT:    [[IV_NEXT_POSTLOOP]] = add nuw nsw i32 [[IV_POSTLOOP]], 4
+; CHECK-NEXT:    [[LOOP_COND_POSTLOOP:%.*]] = icmp slt i32 [[IV_NEXT_POSTLOOP]], [[NUM_ELEMENTS]]
+; CHECK-NEXT:    br i1 [[LOOP_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP18:![0-9]+]], !irce.loop.clone !6
 ;
 entry:
   %capacity = load i32, ptr %capacity_p, !range !4