diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1556,25 +1556,22 @@ // For some CPUs result of CTLZ(X) intrinsic is undefined // when X is 0. If we can not guarantee X != 0, we need to check this // when expand. - bool ZeroCheck = false; + bool FoundZeroCheck = false; // It is safe to assume Preheader exist as it was checked in // parent function RunOnLoop. BasicBlock *PH = CurLoop->getLoopPreheader(); - // If we are using the count instruction outside the loop, make sure we - // have a zero check as a precondition. Without the check the loop would run - // one iteration for before any check of the input value. This means 0 and 1 - // would have identical behavior in the original loop and thus + // If we are using the count instruction outside the loop, see if we have a + // zero check as a precondition. Without the check the loop would run one + // iteration before any check of the input value. This means an input of 0 + // or 1 would produce the same count result. If were aren't able to find + // the zero check, we'll need to insert our own to select a different result + // for an input of 0. if (!IsCntPhiUsedOutsideLoop) { - auto *PreCondBB = PH->getSinglePredecessor(); - if (!PreCondBB) - return false; - auto *PreCondBI = dyn_cast(PreCondBB->getTerminator()); - if (!PreCondBI) - return false; - if (matchCondition(PreCondBI, PH) != InitX) - return false; - ZeroCheck = true; + if (auto *PreCondBB = PH->getSinglePredecessor()) + if (auto *PreCondBI = dyn_cast(PreCondBB->getTerminator())) + if (matchCondition(PreCondBI, PH) == InitX) + FoundZeroCheck = true; } // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always @@ -1588,9 +1585,8 @@ // %inc = add nsw %i.0, 1 // br i1 %tobool - const Value *Args[] = { - InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext()) - : ConstantInt::getFalse(InitX->getContext())}; + const Value *Args[] = {InitX, ConstantInt::getBool(InitX->getContext(), + !IsCntPhiUsedOutsideLoop)}; // @llvm.dbg doesn't count as they have no semantic effect. auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug(); @@ -1605,7 +1601,7 @@ return false; transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX, - DefX->getDebugLoc(), ZeroCheck, + DefX->getDebugLoc(), FoundZeroCheck, IsCntPhiUsedOutsideLoop); return true; } @@ -1674,9 +1670,9 @@ } static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, - const DebugLoc &DL, bool ZeroCheck, + const DebugLoc &DL, bool ZeroUndef, Intrinsic::ID IID) { - Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()}; + Value *Ops[] = {Val, ZeroUndef ? IRBuilder.getTrue() : IRBuilder.getFalse()}; Type *Tys[] = {Val->getType()}; Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); @@ -1721,7 +1717,7 @@ void LoopIdiomRecognize::transformLoopToCountable( Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL, - bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { + bool FoundZeroCheck, bool IsCntPhiUsedOutsideLoop) { BranchInst *PreheaderBr = cast(Preheader->getTerminator()); // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block @@ -1748,7 +1744,10 @@ llvm_unreachable("Unexpected opcode!"); } else InitXNext = InitX; - Value *FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID); + // If the count phi is not used outside the loop, we should use the zero + // undef form. If we didn't find a zero check we'll insert our own later. + Value *FFS = createFFSIntrinsic(Builder, InitXNext, DL, + !IsCntPhiUsedOutsideLoop, IntrinID); Value *Count = Builder.CreateSub( ConstantInt::get(FFS->getType(), FFS->getType()->getIntegerBitWidth()), FFS); @@ -1774,6 +1773,21 @@ NewCount = Builder.CreateSub(CntInitVal, NewCount); } + // If the count instruction is used outside the loop and we didn't find a zero + // check, we need to insert one. It's possible the zero check exists, but was + // missed by our simple check. Later passes may be able to optimize out this + // new check. + if (!IsCntPhiUsedOutsideLoop && !FoundZeroCheck) { + // If we didn't find the zero check, we need to assume we'll execute one + // iteration of the loop when the input is zero. If the input is zero, we + // need to select the initial value incremented/decremented by the count + // instruction's immediate operand. + Value *Cmp = Builder.CreateICmpEQ( + InitXNext, ConstantInt::get(InitXNext->getType(), 0)); + Value *AltCount = Builder.CreateAdd(CntInitVal, CntInst->getOperand(1)); + NewCount = Builder.CreateSelect(Cmp, AltCount, NewCount); + } + // Step 2: Insert new IV and loop condition: // loop: // ... diff --git a/llvm/test/Transforms/LoopIdiom/X86/ctlz.ll b/llvm/test/Transforms/LoopIdiom/X86/ctlz.ll --- a/llvm/test/Transforms/LoopIdiom/X86/ctlz.ll +++ b/llvm/test/Transforms/LoopIdiom/X86/ctlz.ll @@ -256,6 +256,86 @@ ; Recognize CTLZ builtin pattern. ; Here it will replace the loop - ; assume builtin is always profitable. +; The C code here represents how other loop transformations and instcombine +; may optimize the earlier ctlz_zero_check test case. The precondition check +; for 0 is using the input to the absolute value pattern instead of the output. +; This makes it different than the Value* passed to the loop phi. So we won't +; be able to tell the input was checked for 0 so our replacement will need to +; account for this. +; +; int ctlz_zero_check_elided_abs(int n) +; { +; int abs_n = n >= 0 ? n : -n; +; if (!n) +; return 0; +; int i = 0; +; do { +; abs_n >>= 1; +; i++; +; } while (abs_n); +; return i; +; } +; +define i32 @ctlz_zero_check_elided_abs(i32 %n) { +; ALL-LABEL: @ctlz_zero_check_elided_abs( +; ALL-NEXT: entry: +; ALL-NEXT: [[C:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; ALL-NEXT: [[NEGN:%.*]] = sub nsw i32 0, [[N]] +; ALL-NEXT: [[ABS_N:%.*]] = select i1 [[C]], i32 [[N]], i32 [[NEGN]] +; ALL-NEXT: [[TOBOOL4:%.*]] = icmp eq i32 [[N]], 0 +; ALL-NEXT: br i1 [[TOBOOL4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] +; ALL: while.body.preheader: +; ALL-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[ABS_N]], i1 true) +; ALL-NEXT: [[TMP1:%.*]] = sub i32 32, [[TMP0]] +; ALL-NEXT: [[TMP2:%.*]] = icmp eq i32 [[ABS_N]], 0 +; ALL-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 1, i32 [[TMP1]] +; ALL-NEXT: br label [[WHILE_BODY:%.*]] +; ALL: while.body: +; ALL-NEXT: [[TCPHI:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ] +; ALL-NEXT: [[I_06:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; ALL-NEXT: [[N_ADDR_05:%.*]] = phi i32 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[ABS_N]], [[WHILE_BODY_PREHEADER]] ] +; ALL-NEXT: [[SHR]] = lshr i32 [[N_ADDR_05]], 1 +; ALL-NEXT: [[INC]] = add nsw i32 [[I_06]], 1 +; ALL-NEXT: [[TCDEC]] = sub nsw i32 [[TCPHI]], 1 +; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0 +; ALL-NEXT: br i1 [[TOBOOL]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] +; ALL: while.end.loopexit: +; ALL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY]] ] +; ALL-NEXT: br label [[WHILE_END]] +; ALL: while.end: +; ALL-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ] +; ALL-NEXT: ret i32 [[I_0_LCSSA]] +; +entry: + %c = icmp sgt i32 %n, 0 + %negn = sub nsw i32 0, %n + %abs_n = select i1 %c, i32 %n, i32 %negn + %tobool4 = icmp eq i32 %n, 0 + br i1 %tobool4, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %n.addr.05 = phi i32 [ %shr, %while.body ], [ %abs_n, %while.body.preheader ] + %shr = lshr i32 %n.addr.05, 1 + %inc = add nsw i32 %i.06, 1 + %tobool = icmp eq i32 %shr, 0 + br i1 %tobool, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ] + ret i32 %i.0.lcssa +} + + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. ; ; int ctlz(int n) ; { @@ -630,10 +710,11 @@ ret i32 %cnt.0.lcssa } -; We can't easily transform this loop. It returns 1 for an input of both -; 0 and 1. +; This loop has no check for n being 0 before entering the loop so we need to +; make one in order to use ctlz since an input of 0 or 1 will produce the same +; count. ; -; int ctlz_bad(unsigned n) +; int ctlz_no_zero_check(unsigned n) ; { ; int i = 0; ; do { @@ -643,19 +724,25 @@ ; return i; ; } ; -define i32 @ctlz_bad(i32 %n) { -; ALL-LABEL: @ctlz_bad( +define i32 @ctlz_no_zero_check(i32 %n) { +; ALL-LABEL: @ctlz_no_zero_check( ; ALL-NEXT: entry: +; ALL-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[N:%.*]], i1 true) +; ALL-NEXT: [[TMP1:%.*]] = sub i32 32, [[TMP0]] +; ALL-NEXT: [[TMP2:%.*]] = icmp eq i32 [[N]], 0 +; ALL-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 1, i32 [[TMP1]] ; ALL-NEXT: br label [[WHILE_COND:%.*]] ; ALL: while.cond: -; ALL-NEXT: [[N_ADDR_0:%.*]] = phi i32 [ [[N:%.*]], [[ENTRY:%.*]] ], [ [[SHR:%.*]], [[WHILE_COND]] ] +; ALL-NEXT: [[TCPHI:%.*]] = phi i32 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ] +; ALL-NEXT: [[N_ADDR_0:%.*]] = phi i32 [ [[N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ] ; ALL-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ] ; ALL-NEXT: [[SHR]] = lshr i32 [[N_ADDR_0]], 1 -; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[SHR]], 0 +; ALL-NEXT: [[TCDEC]] = sub nsw i32 [[TCPHI]], 1 +; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0 ; ALL-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 ; ALL-NEXT: br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]] ; ALL: while.end: -; ALL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_COND]] ] +; ALL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[WHILE_COND]] ] ; ALL-NEXT: ret i32 [[INC_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll b/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll @@ -6,7 +6,6 @@ ; This test ensures we are able to optimize the following loop to an llvm.abs ; followed by an llvm.ctlz. -; FIXME: LoopIdiom recongize is not forming llvm.ctlz. ; int ctlz_zero_check(int n) ; { @@ -19,23 +18,15 @@ ; return i; ; } +; FIXME: The icmp and select in the output are unnecessary. define i32 @ctlz_loop_with_abs(i32 %n) { ; CHECK-LABEL: @ctlz_loop_with_abs( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TOBOOL_NOT1:%.*]] = icmp eq i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_NOT1]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] -; CHECK: while.body.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true) -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[N_ADDR_03:%.*]] = phi i32 [ [[TMP1:%.*]], [[WHILE_BODY]] ], [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[TMP1]] = lshr i32 [[N_ADDR_03]], 1 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_02]], 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END]], label [[WHILE_BODY]] -; CHECK: while.end: -; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 true), !range [[RNG0:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = sub nuw nsw i32 32, [[TMP1]] +; CHECK-NEXT: [[I_0_LCSSA:%.*]] = select i1 [[TOBOOL_NOT1]], i32 0, i32 [[TMP2]] ; CHECK-NEXT: ret i32 [[I_0_LCSSA]] ; entry: