Index: lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp =================================================================== --- lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -116,6 +116,11 @@ static cl::opt AllowUnsignedLatchCondition("irce-allow-unsigned-latch", cl::Hidden, cl::init(true)); +static cl::opt AllowNarrowLatchCondition( + "irce-allow-narrow-latch", cl::Hidden, cl::init(true), + cl::desc("If set to true, IRCE may eliminate wide range checks in loops " + "with narrow latch condition.")); + static const char *ClonedLoopTag = "irce.loop.clone"; #define DEBUG_TYPE "irce" @@ -1045,11 +1050,23 @@ return Result; } +/// If the type of \p S matches with \p Ty, return \p S. Otherwise, return +/// signed or unsigned extension of \p S to type \p Ty. +static const SCEV *NoopOrExtend(const SCEV *S, Type *Ty, ScalarEvolution &SE, + bool Signed) { + return Signed ? SE.getNoopOrSignExtend(S, Ty) : SE.getNoopOrZeroExtend(S, Ty); +} + Optional LoopConstrainer::calculateSubRanges(bool IsSignedPredicate) const { IntegerType *Ty = cast(LatchTakenCount->getType()); - if (Range.getType() != Ty) + auto *RTy = cast(Range.getType()); + + // We only support wide range checks and narrow latches. + if (!AllowNarrowLatchCondition && RTy != Ty) + return None; + if (RTy->getBitWidth() < Ty->getBitWidth()) return None; LoopConstrainer::SubRanges Result; @@ -1057,8 +1074,10 @@ // I think we can be more aggressive here and make this nuw / nsw if the // addition that feeds into the icmp for the latch's terminating branch is nuw // / nsw. In any case, a wrapping 2's complement addition is safe. - const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart); - const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt); + const SCEV *Start = NoopOrExtend(SE.getSCEV(MainLoopStructure.IndVarStart), + RTy, SE, IsSignedPredicate); + const SCEV *End = NoopOrExtend(SE.getSCEV(MainLoopStructure.LoopExitAt), RTy, + SE, IsSignedPredicate); bool Increasing = MainLoopStructure.IndVarIncreasing; @@ -1068,7 +1087,7 @@ const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr; - const SCEV *One = SE.getOne(Ty); + const SCEV *One = SE.getOne(RTy); if (Increasing) { Smallest = Start; Greatest = End; @@ -1257,6 +1276,13 @@ bool IsSignedPredicate = LS.IsSignedPredicate; IRBuilder<> B(PreheaderJump); + auto *RangeTy = Range.getBegin()->getType(); + auto NoopOrExt = [&](Value *V) { + if (V->getType() == RangeTy) + return V; + return IsSignedPredicate ? B.CreateSExt(V, RangeTy, "wide.start") : + B.CreateZExt(V, RangeTy, "wide.start"); + }; // EnterLoopCond - is it okay to start executing this `LS'? Value *EnterLoopCond = nullptr; @@ -1264,9 +1290,7 @@ Increasing ? (IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT) : (IsSignedPredicate ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT); - Value *IndVarStart = LS.IndVarStart; - Value *IndVarBase = LS.IndVarBase; - Value *LoopExitAt = LS.LoopExitAt; + Value *IndVarStart = NoopOrExt(LS.IndVarStart); EnterLoopCond = B.CreateICmp(Pred, IndVarStart, ExitSubloopAt); B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit); @@ -1274,6 +1298,7 @@ LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector); B.SetInsertPoint(LS.LatchBr); + Value *IndVarBase = NoopOrExt(LS.IndVarBase); Value *TakeBackedgeLoopCond = B.CreateICmp(Pred, IndVarBase, ExitSubloopAt); Value *CondForBranch = LS.LatchBrExitIdx == 1 @@ -1287,6 +1312,7 @@ // IterationsLeft - are there any more iterations left, given the original // upper bound on the induction variable? If not, we branch to the "real" // exit. + Value *LoopExitAt = NoopOrExt(LS.LoopExitAt); Value *IterationsLeft = B.CreateICmp(Pred, IndVarBase, LoopExitAt); B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit); @@ -1395,7 +1421,7 @@ SubRanges SR = MaybeSR.getValue(); bool Increasing = MainLoopStructure.IndVarIncreasing; IntegerType *IVTy = - cast(MainLoopStructure.IndVarBase->getType()); + cast(Range.getBegin()->getType()); SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce"); Instruction *InsertPt = OriginalPreheader->getTerminator(); @@ -1558,6 +1584,12 @@ InductiveRangeCheck::computeSafeIterationSpace( ScalarEvolution &SE, const SCEVAddRecExpr *IndVar, bool IsLatchSigned) const { + // We can deal when types of latch check and range checks don't match in case + // if latch check is more narrow. + auto *IVType = cast(IndVar->getType()); + auto *RCType = cast(getBegin()->getType()); + if (IVType->getBitWidth() > RCType->getBitWidth()) + return None; // IndVar is of the form "A + B * I" (where "I" is the canonical induction // variable, that may or may not exist as a real llvm::Value in the loop) and // this inductive range check is a range check on the "C + D * I" ("C" is @@ -1581,8 +1613,9 @@ if (!IndVar->isAffine()) return None; - const SCEV *A = IndVar->getStart(); - const SCEVConstant *B = dyn_cast(IndVar->getStepRecurrence(SE)); + const SCEV *A = NoopOrExtend(IndVar->getStart(), RCType, SE, IsLatchSigned); + const SCEVConstant *B = dyn_cast( + NoopOrExtend(IndVar->getStepRecurrence(SE), RCType, SE, IsLatchSigned)); if (!B) return None; assert(!B->isZero() && "Recurrence with zero step?"); @@ -1593,7 +1626,7 @@ return None; assert(!D->getValue()->isZero() && "Recurrence with zero step?"); - unsigned BitWidth = cast(IndVar->getType())->getBitWidth(); + unsigned BitWidth = RCType->getBitWidth(); const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth)); // Subtract Y from X so that it does not go through border of the IV Index: test/Transforms/IRCE/wide_indvar.ll =================================================================== --- /dev/null +++ test/Transforms/IRCE/wide_indvar.ll @@ -0,0 +1,459 @@ +; RUN: opt -verify-loop-info -irce-print-changed-loops -irce -S < %s 2>&1 | FileCheck %s +; RUN: opt -verify-loop-info -irce-print-changed-loops -passes='require,loop(irce)' -S < %s 2>&1 | FileCheck %s + +; Check that we can remove trivially non-failing range check. +define i32 @test_increasing_slt_slt_wide_simple_no_postloop() { + +; CHECK-LABEL: @test_increasing_slt_slt_wide_simple_no_postloop( +; CHECK-NOT: preloop +; CHECK-NOT: postloop +; CHECK: loop: +; CHECK: br i1 true, label %backedge, label %check_failed + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %rc = icmp slt i64 %iv, 100 + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %narrow.iv = trunc i64 %iv.next to i32 + %latch.cond = icmp slt i32 %narrow.iv, 100 + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +; This range check fails on the last iteration, so it needs a postloop. +define i32 @test_increasing_slt_slt_wide_simple_postloop() { + +; CHECK-LABEL: @test_increasing_slt_slt_wide_simple_postloop( +; CHECK-NOT: preloop +; CHECK: loop: +; CHECK: br i1 true, label %backedge, label %check_failed +; CHECK: backedge +; CHECK: [[COND:%[^ ]+]] = icmp slt i64 %wide.start, 99 +; CHECK: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: postloop + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %rc = icmp slt i64 %iv, 99 + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %narrow.iv = trunc i64 %iv.next to i32 + %latch.cond = icmp slt i32 %narrow.iv, 100 + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +; General case. If both %N and %M are non-negative, we do not need a preloop. +define i32 @test_increasing_slt_slt_wide_non-negative(i32* %n_ptr, i64* %m_ptr) { + +; CHECK-LABEL: @test_increasing_slt_slt_wide_non-negative( +; CHECK-NOT: preloop +; CHECK: loop: +; CHECK: br i1 true, label %backedge, label %check_failed +; CHECK: backedge +; CHECK: [[COND:%[^ ]+]] = icmp slt i64 %wide.start, %exit.mainloop.at +; CHECK: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: postloop + +entry: + %N = load i32, i32* %n_ptr, !range !2 + %M = load i64, i64* %m_ptr, !range !1 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %rc = icmp slt i64 %iv, %M + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %narrow.iv = trunc i64 %iv.next to i32 + %latch.cond = icmp slt i32 %narrow.iv, %N + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +; General case. Even though %M may be negative, we do not need a preloop because +; we make a non-negativity runtime check against M and do not go to main loop if +; M was negative. +define i32 @test_increasing_slt_slt_wide_general(i32* %n_ptr, i64* %m_ptr) { + +; CHECK-LABEL: @test_increasing_slt_slt_wide_general( +; CHECK-NOT: preloop +; CHECK: loop: +; CHECK: br i1 true, label %backedge, label %check_failed +; CHECK: backedge +; CHECK: [[COND:%[^ ]+]] = icmp slt i64 +; CHECK: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: postloop + +entry: + %N = load i32, i32* %n_ptr, !range !2 + %M = load i64, i64* %m_ptr + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %rc = icmp slt i64 %iv, %M + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %narrow.iv = trunc i64 %iv.next to i32 + %latch.cond = icmp slt i32 %narrow.iv, %N + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +; General case with preloop. +define i32 @test_increasing_slt_slt_wide_general_preloop(i32* %n_ptr, i64* %m_ptr) { + +; CHECK-LABEL: @test_increasing_slt_slt_wide_general_preloop( +; CHECK: loop: +; CHECK: br i1 true, label %backedge, label %check_failed +; CHECK: backedge +; CHECK: [[COND:%[^ ]+]] = icmp slt i64 +; CHECK: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: preloop +; CHECK: postloop + +entry: + %N = load i32, i32* %n_ptr, !range !2 + %M = load i64, i64* %m_ptr + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %rc = icmp slt i64 %iv, %M + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %narrow.iv = trunc i64 %iv to i32 + %latch.cond = icmp slt i32 %narrow.iv, %N + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +; Same as above, multiple checks. +define i32 @test_increasing_slt_slt_wide_multiple_checks(i32* %n_ptr, i64* %m1_ptr, i64* %m2_ptr, i64* %m3_ptr, i64* %m4_ptr) { +; CHECK-LABEL: @test_increasing_slt_slt_wide_multiple_checks( +; CHECK-NOT: preloop +; CHECK: loop: +; CHECK: %c1 = and i1 true, true +; CHECK: %c2 = and i1 %c1, true +; CHECK: %rc = and i1 %c2, true +; CHECK: br i1 %rc, label %backedge, label %check_failed.loopexit +; CHECK: backedge +; CHECK: [[COND:%[^ ]+]] = icmp slt i64 +; CHECK: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: postloop + +entry: + %N = load i32, i32* %n_ptr, !range !2 + %M1 = load i64, i64* %m1_ptr + %M2 = load i64, i64* %m2_ptr + %M3 = load i64, i64* %m3_ptr + %M4 = load i64, i64* %m4_ptr + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %rc1 = icmp slt i64 %iv, %M1 + %rc2 = icmp slt i64 %iv, %M2 + %rc3 = icmp slt i64 %iv, %M3 + %rc4 = icmp slt i64 %iv, %M4 + %c1 = and i1 %rc1, %rc2 + %c2 = and i1 %c1, %rc3 + %rc = and i1 %c2, %rc4 + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %narrow.iv = trunc i64 %iv.next to i32 + %latch.cond = icmp slt i32 %narrow.iv, %N + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +; Wide IV against narrow range check. We don't currently support it. +define i32 @test_increasing_slt_slt_wide_simple_negtest_narrow_rc() { + +; CHECK-LABEL: @test_increasing_slt_slt_wide_simple_negtest_narrow_rc( +; CHECK-NOT: i1 true +; CHECK-NOT: main + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %narrow.iv = trunc i64 %iv to i32 + %rc = icmp slt i32 %narrow.iv, 101 + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %latch.cond = icmp slt i64 %iv, 100 + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +; Check that we can remove trivially non-failing range check. +define i32 @test_increasing_ult_ult_wide_simple_no_postloop() { + +; CHECK-LABEL: @test_increasing_ult_ult_wide_simple_no_postloop( +; CHECK-NOT: preloop +; CHECK-NOT: postloop +; CHECK: loop: +; CHECK: br i1 true, label %backedge, label %check_failed + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %rc = icmp ult i64 %iv, 100 + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %narrow.iv = trunc i64 %iv.next to i32 + %latch.cond = icmp ult i32 %narrow.iv, 100 + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +; This range check fails on the last iteration, so it needs a postloop. +define i32 @test_increasing_ult_ult_wide_simple_postloop() { + +; CHECK-LABEL: @test_increasing_ult_ult_wide_simple_postloop( +; CHECK-NOT: preloop +; CHECK: loop: +; CHECK: br i1 true, label %backedge, label %check_failed +; CHECK: backedge +; CHECK: [[COND:%[^ ]+]] = icmp ult i64 %wide.start, 99 +; CHECK: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: postloop + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %rc = icmp ult i64 %iv, 99 + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %narrow.iv = trunc i64 %iv.next to i32 + %latch.cond = icmp ult i32 %narrow.iv, 100 + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +; General case. If both %N and %M are non-negative, we do not need a preloop. +define i32 @test_increasing_ult_ult_wide_non-negative(i32* %n_ptr, i64* %m_ptr) { + +; CHECK-LABEL: @test_increasing_ult_ult_wide_non-negative( +; CHECK-NOT: preloop +; CHECK: loop: +; CHECK: br i1 true, label %backedge, label %check_failed +; CHECK: backedge +; CHECK: [[COND:%[^ ]+]] = icmp ult i64 %wide.start, %exit.mainloop.at +; CHECK: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: postloop + +entry: + %N = load i32, i32* %n_ptr, !range !2 + %M = load i64, i64* %m_ptr, !range !1 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %rc = icmp ult i64 %iv, %M + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %narrow.iv = trunc i64 %iv.next to i32 + %latch.cond = icmp ult i32 %narrow.iv, %N + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +; General case. Even though %M may be negative, we do not need a preloop because +; we make a non-negativity runtime check against M and do not go to main loop if +; M was negative. +define i32 @test_increasing_ult_ult_wide_general(i32* %n_ptr, i64* %m_ptr) { + +; CHECK-LABEL: @test_increasing_ult_ult_wide_general( +; CHECK-NOT: preloop +; CHECK: loop: +; CHECK: br i1 true, label %backedge, label %check_failed +; CHECK: backedge +; CHECK: [[COND:%[^ ]+]] = icmp ult i64 +; CHECK: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: postloop + +entry: + %N = load i32, i32* %n_ptr, !range !2 + %M = load i64, i64* %m_ptr + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %rc = icmp ult i64 %iv, %M + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %narrow.iv = trunc i64 %iv.next to i32 + %latch.cond = icmp ult i32 %narrow.iv, %N + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +; Same as above, multiple checks. +define i32 @test_increasing_ult_ult_wide_multiple_checks(i32* %n_ptr, i64* %m1_ptr, i64* %m2_ptr, i64* %m3_ptr, i64* %m4_ptr) { +; CHECK-LABEL: @test_increasing_ult_ult_wide_multiple_checks( +; CHECK-NOT: preloop +; CHECK: loop: +; CHECK: %c1 = and i1 true, true +; CHECK: %c2 = and i1 %c1, true +; CHECK: %rc = and i1 %c2, true +; CHECK: br i1 %rc, label %backedge, label %check_failed.loopexit +; CHECK: backedge +; CHECK: [[COND:%[^ ]+]] = icmp ult i64 +; CHECK: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: postloop + +entry: + %N = load i32, i32* %n_ptr, !range !2 + %M1 = load i64, i64* %m1_ptr + %M2 = load i64, i64* %m2_ptr + %M3 = load i64, i64* %m3_ptr + %M4 = load i64, i64* %m4_ptr + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %rc1 = icmp ult i64 %iv, %M1 + %rc2 = icmp ult i64 %iv, %M2 + %rc3 = icmp ult i64 %iv, %M3 + %rc4 = icmp ult i64 %iv, %M4 + %c1 = and i1 %rc1, %rc2 + %c2 = and i1 %c1, %rc3 + %rc = and i1 %c2, %rc4 + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %narrow.iv = trunc i64 %iv.next to i32 + %latch.cond = icmp ult i32 %narrow.iv, %N + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +; Wide IV against narrow range check. We don't currently support it. +define i32 @test_increasing_ult_ult_wide_simple_negtest_narrow_rc() { + +; CHECK-LABEL: @test_increasing_ult_ult_wide_simple_negtest_narrow_rc( +; CHECK-NOT: i1 true +; CHECK-NOT: main + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %backedge ] + %narrow.iv = trunc i64 %iv to i32 + %rc = icmp ult i32 %narrow.iv, 101 + br i1 %rc, label %backedge, label %check_failed + +backedge: + %iv.next = add i64 %iv, 1 + %latch.cond = icmp ult i64 %iv, 100 + br i1 %latch.cond, label %loop, label %exit + +exit: + ret i32 %narrow.iv + +check_failed: + ret i32 -1 +} + +!0 = !{i32 0, i32 2147483647} +!1 = !{i64 0, i64 9223372036854775807} +!2 = !{i32 1, i32 2147483647}