Index: lib/Transforms/Scalar/LoopIdiomRecognize.cpp =================================================================== --- lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -188,8 +188,9 @@ PHINode *CntPhi, Value *Var); bool recognizeAndInsertCTLZ(); void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst, - PHINode *CntPhi, Value *Var, const DebugLoc DL, - bool ZeroCheck, bool IsCntPhiUsedOutsideLoop); + PHINode *CntPhi, Value *Var, Instruction *DefX, + const DebugLoc DL, bool ZeroCheck, + bool IsCntPhiUsedOutsideLoop); /// @} }; @@ -1316,8 +1317,8 @@ return false; // step 2: detect instructions corresponding to "x.next = x >> 1" - // TODO: Support loops that use LShr. - if (!DefX || DefX->getOpcode() != Instruction::AShr) + if (!DefX || (DefX->getOpcode() != Instruction::AShr && + DefX->getOpcode() != Instruction::LShr)) return false; ConstantInt *Shft = dyn_cast(DefX->getOperand(1)); if (!Shft || !Shft->isOne()) @@ -1401,8 +1402,7 @@ // Make sure the initial value can't be negative otherwise the ashr in the // loop might never reach zero which would make the loop infinite. - // TODO: Support loops that use lshr and wouldn't need this check. - if (!isKnownNonNegative(InitX, *DL)) + if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, *DL)) return false; // If we check X != 0 before entering the loop we don't need a zero @@ -1434,7 +1434,7 @@ return false; const DebugLoc DL = DefX->getDebugLoc(); - transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck, + transformLoopToCountable(PH, CntInst, CntPhi, InitX, DefX, DL, ZeroCheck, IsCntPhiUsedOutsideLoop); return true; } @@ -1548,7 +1548,8 @@ /// If CntInst and DefX are not used in LOOP_BODY they will be removed. void LoopIdiomRecognize::transformLoopToCountable( BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, - const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { + Instruction *DefX, const DebugLoc DL, bool ZeroCheck, + bool IsCntPhiUsedOutsideLoop) { BranchInst *PreheaderBr = cast(Preheader->getTerminator()); // Step 1: Insert the CTLZ instruction at the end of the preheader block @@ -1559,10 +1560,14 @@ Builder.SetCurrentDebugLocation(DL); Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext; - if (IsCntPhiUsedOutsideLoop) - InitXNext = Builder.CreateAShr(InitX, - ConstantInt::get(InitX->getType(), 1)); - else + if (IsCntPhiUsedOutsideLoop) { + if (DefX->getOpcode() == Instruction::AShr) + InitXNext = + Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1)); + else if (DefX->getOpcode() == Instruction::LShr) + InitXNext = + Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1)); + } else InitXNext = InitX; CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck); Count = Builder.CreateSub( Index: test/Transforms/LoopIdiom/X86/ctlz.ll =================================================================== --- test/Transforms/LoopIdiom/X86/ctlz.ll +++ test/Transforms/LoopIdiom/X86/ctlz.ll @@ -115,6 +115,52 @@ ret i32 %i.0.lcssa } +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz_zero_check_lshr(int n) +; { +; int i = 0; +; while(n) { +; n >>= 1; +; i++; +; } +; return i; +; } +; +; ALL: entry +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %n, i1 true) +; ALL-NEXT: %1 = sub i32 32, %0 +; ALL: %inc.lcssa = phi i32 [ %1, %while.body ] +; ALL: %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ] +; ALL: ret i32 %i.0.lcssa + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz_zero_check_lshr(i32 %n) { +entry: + %tobool4 = icmp eq i32 %n, 0 + br i1 %tobool4, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %n.addr.05 = phi i32 [ %shr, %while.body ], [ %n, %while.body.preheader ] + %shr = lshr i32 %n.addr.05, 1 + %inc = add nsw i32 %i.06, 1 + %tobool = icmp eq i32 %shr, 0 + br i1 %tobool, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ] + ret i32 %i.0.lcssa +} + ; Recognize CTLZ builtin pattern. ; Here it will replace the loop - ; assume builtin is always profitable. @@ -157,6 +203,44 @@ ret i32 %i.0 } +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz_lshr(int n) +; { +; int i = 0; +; while(n >>= 1) { +; i++; +; } +; return i; +; } +; +; ALL: entry +; ALL: %0 = lshr i32 %n, 1 +; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) +; ALL-NEXT: %2 = sub i32 32, %1 +; ALL-NEXT: %3 = add i32 %2, 1 +; ALL: %i.0.lcssa = phi i32 [ %2, %while.cond ] +; ALL: ret i32 %i.0.lcssa + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz_lshr(i32 %n) { +entry: + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ] + %shr = lshr i32 %n.addr.0, 1 + %tobool = icmp eq i32 %shr, 0 + %inc = add nsw i32 %i.0, 1 + br i1 %tobool, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret i32 %i.0 +} + ; Recognize CTLZ builtin pattern. ; Here it will replace the loop - ; assume builtin is always profitable. @@ -200,6 +284,45 @@ ret i32 %i.0 } +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz_add_lshr(int n, int i0) +; { +; int i = i0; +; while(n >>= 1) { +; i++; +; } +; return i; +; } +; +; ALL: entry +; ALL: %0 = lshr i32 %n, 1 +; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) +; ALL-NEXT: %2 = sub i32 32, %1 +; ALL-NEXT: %3 = add i32 %2, 1 +; ALL-NEXT: %4 = add i32 %2, %i0 +; ALL: %i.0.lcssa = phi i32 [ %4, %while.cond ] +; ALL: ret i32 %i.0.lcssa +; +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz_add_lshr(i32 %n, i32 %i0) { +entry: + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ] + %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ] + %shr = lshr i32 %n.addr.0, 1 + %tobool = icmp eq i32 %shr, 0 + %inc = add nsw i32 %i.0, 1 + br i1 %tobool, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret i32 %i.0 +} + ; Recognize CTLZ builtin pattern. ; Here it will replace the loop - ; assume builtin is always profitable. @@ -245,6 +368,45 @@ ret i32 %i.0 } +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz_sext_lshr(short in) +; { +; int i = 0; +; while(in >>= 1) { +; i++; +; } +; return i; +; } +; +; ALL: entry +; ALL: %0 = lshr i32 %n, 1 +; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) +; ALL-NEXT: %2 = sub i32 32, %1 +; ALL-NEXT: %3 = add i32 %2, 1 +; ALL: %i.0.lcssa = phi i32 [ %2, %while.cond ] +; ALL: ret i32 %i.0.lcssa + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz_sext_lshr(i16 %in) { +entry: + %n = sext i16 %in to i32 + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ] + %shr = lshr i32 %n.addr.0, 1 + %tobool = icmp eq i32 %shr, 0 + %inc = add nsw i32 %i.0, 1 + br i1 %tobool, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret i32 %i.0 +} + ; This loop contains a volatile store. If x is initially negative, ; the code will be an infinite loop because the ashr will eventually produce ; all ones and continue doing so. This prevents the loop from terminating. If