Index: llvm/trunk/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ llvm/trunk/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -26,7 +26,7 @@
 // Future floating point idioms to recognize in -ffast-math mode:
 // fpowi
 // Future integer operation idioms to recognize:
-// ctpop, ctlz, cttz
+// ctpop
 //
 // Beware that isel's default lowering for ctpop is highly inefficient for
 // i64 and larger types when i64 is legal and the value has few bits set. It
@@ -187,9 +187,10 @@
   bool recognizePopcount();
   void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                                PHINode *CntPhi, Value *Var);
-  bool recognizeAndInsertCTLZ();
-  void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
-                                PHINode *CntPhi, Value *Var, Instruction *DefX,
+  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
+  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
+                                Instruction *CntInst, PHINode *CntPhi,
+                                Value *Var, Instruction *DefX,
                                 const DebugLoc &DL, bool ZeroCheck,
                                 bool IsCntPhiUsedOutsideLoop);
 
@@ -1108,15 +1109,17 @@
 }
 
 bool LoopIdiomRecognize::runOnNoncountableLoop() {
-  return recognizePopcount() || recognizeAndInsertCTLZ();
+  return recognizePopcount() || recognizeAndInsertFFS();
 }
 
 /// Check if the given conditional branch is based on the comparison between
-/// a variable and zero, and if the variable is non-zero, the control yields to
-/// the loop entry. If the branch matches the behavior, the variable involved
-/// in the comparison is returned. This function will be called to see if the
-/// precondition and postcondition of the loop are in desirable form.
-static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
+/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
+/// true), the control yields to the loop entry. If the branch matches the
+/// behavior, the variable involved in the comparison is returned. This function
+/// will be called to see if the precondition and postcondition of the loop are
+/// in desirable form.
+static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
+                             bool JmpOnZero = false) {
   if (!BI || !BI->isConditional())
     return nullptr;
 
@@ -1128,9 +1131,14 @@
   if (!CmpZero || !CmpZero->isZero())
     return nullptr;
 
+  BasicBlock *TrueSucc = BI->getSuccessor(0);
+  BasicBlock *FalseSucc = BI->getSuccessor(1);
+  if (JmpOnZero)
+    std::swap(TrueSucc, FalseSucc);
+
   ICmpInst::Predicate Pred = Cond->getPredicate();
-  if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) ||
-      (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry))
+  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
+      (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
     return Cond->getOperand(0);
 
   return nullptr;
@@ -1306,14 +1314,14 @@
 ///
 /// loop-exit:
 /// \endcode
-static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
-                            Instruction *&CntInst, PHINode *&CntPhi,
-                            Instruction *&DefX) {
+static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
+                                      Intrinsic::ID &IntrinID, Value *&InitX,
+                                      Instruction *&CntInst, PHINode *&CntPhi,
+                                      Instruction *&DefX) {
   BasicBlock *LoopEntry;
   Value *VarX = nullptr;
 
   DefX = nullptr;
-  PhiX = nullptr;
   CntInst = nullptr;
   CntPhi = nullptr;
   LoopEntry = *(CurLoop->block_begin());
@@ -1325,20 +1333,28 @@
   else
     return false;
 
-  // step 2: detect instructions corresponding to "x.next = x >> 1"
-  if (!DefX || (DefX->getOpcode() != Instruction::AShr &&
-                DefX->getOpcode() != Instruction::LShr))
+  // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
+  if (!DefX || !DefX->isShift())
     return false;
+  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
+                                                     Intrinsic::ctlz;
   ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
   if (!Shft || !Shft->isOne())
     return false;
   VarX = DefX->getOperand(0);
 
   // step 3: Check the recurrence of variable X
-  PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
+  PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
   if (!PhiX)
     return false;
 
+  InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
+
+  // Make sure the initial value can't be negative otherwise the ashr in the
+  // loop might never reach zero which would make the loop infinite.
+  if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
+    return false;
+
   // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
   // TODO: We can skip the step. If loop trip count is known (CTLZ),
   // then all uses of "cnt.next" could be optimized to the trip count
@@ -1370,17 +1386,25 @@
   return true;
 }
 
-/// Recognize CTLZ idiom in a non-countable loop and convert the loop
-/// to countable (with CTLZ trip count).
-/// If CTLZ inserted as a new trip count returns true; otherwise, returns false.
-bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
+/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
+/// trip count returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizeAndInsertFFS() {
   // Give up if the loop has multiple blocks or multiple backedges.
   if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
     return false;
 
-  Instruction *CntInst, *DefX;
-  PHINode *CntPhi, *PhiX;
-  if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+  Intrinsic::ID IntrinID;
+  Value *InitX;
+  Instruction *DefX = nullptr;
+  PHINode *CntPhi = nullptr;
+  Instruction *CntInst = nullptr;
+  // Help decide if transformation is profitable. For ShiftUntilZero idiom,
+  // this is always 6.
+  size_t IdiomCanonicalSize = 6;
+
+  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
+                                 CntInst, CntPhi, DefX))
     return false;
 
   bool IsCntPhiUsedOutsideLoop = false;
@@ -1407,12 +1431,6 @@
   // It is safe to assume Preheader exist as it was checked in
   // parent function RunOnLoop.
   BasicBlock *PH = CurLoop->getLoopPreheader();
-  Value *InitX = PhiX->getIncomingValueForBlock(PH);
-
-  // Make sure the initial value can't be negative otherwise the ashr in the
-  // loop might never reach zero which would make the loop infinite.
-  if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, *DL))
-    return false;
 
   // If we are using the count instruction outside the loop, make sure we
   // have a zero check as a precondition. Without the check the loop would run
@@ -1430,8 +1448,10 @@
     ZeroCheck = true;
   }
 
-  // Check if CTLZ intrinsic is profitable. Assume it is always profitable
-  // if we delete the loop (the loop has only 6 instructions):
+  // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
+  // profitable if we delete the loop.
+
+  // the loop has only 6 instructions:
   // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
   // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
   // %shr = ashr %n.addr.0, 1
@@ -1442,12 +1462,12 @@
   const Value *Args[] =
       {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
                         : ConstantInt::getFalse(InitX->getContext())};
-  if (CurLoop->getHeader()->size() != 6 &&
-      TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
-          TargetTransformInfo::TCC_Basic)
+  if (CurLoop->getHeader()->size() != IdiomCanonicalSize &&
+      TTI->getIntrinsicCost(IntrinID, InitX->getType(), Args) >
+          TargetTransformInfo::TCC_Basic)
     return false;
 
-  transformLoopToCountable(PH, CntInst, CntPhi, InitX, DefX,
+  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
                            DefX->getDebugLoc(), ZeroCheck,
                            IsCntPhiUsedOutsideLoop);
   return true;
@@ -1516,20 +1536,21 @@
   return CI;
 }
 
-static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
-                                     const DebugLoc &DL, bool ZeroCheck) {
+static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+                                    const DebugLoc &DL, bool ZeroCheck,
+                                    Intrinsic::ID IID) {
   Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
   Type *Tys[] = {Val->getType()};
 
   Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
-  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
+  Value *Func = Intrinsic::getDeclaration(M, IID, Tys);
   CallInst *CI = IRBuilder.CreateCall(Func, Ops);
   CI->setDebugLoc(DL);
 
   return CI;
 }
 
-/// Transform the following loop:
+/// Transform the following loop (Using CTLZ, CTTZ is similar):
 /// loop:
 /// CntPhi = PHI [Cnt0, CntInst]
 /// PhiX = PHI [InitX, DefX]
@@ -1561,19 +1582,19 @@
 /// If LOOP_BODY is empty the loop will be deleted.
 /// If CntInst and DefX are not used in LOOP_BODY they will be removed.
 void LoopIdiomRecognize::transformLoopToCountable(
-    BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
-    Instruction *DefX, const DebugLoc &DL, bool ZeroCheck,
-    bool IsCntPhiUsedOutsideLoop) {
+    Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
+    PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
+    bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
   BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
 
-  // Step 1: Insert the CTLZ instruction at the end of the preheader block
-  // Count = BitWidth - CTLZ(InitX);
-  // If there are uses of CntPhi create:
-  // CountPrev = BitWidth - CTLZ(InitX >> 1);
+  // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
   IRBuilder<> Builder(PreheaderBr);
   Builder.SetCurrentDebugLocation(DL);
-  Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
+  Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext;
 
+  // Count = BitWidth - CTLZ(InitX);
+  // If there are uses of CntPhi create:
+  // CountPrev = BitWidth - CTLZ(InitX >> 1);
   if (IsCntPhiUsedOutsideLoop) {
     if (DefX->getOpcode() == Instruction::AShr)
       InitXNext =
@@ -1581,29 +1602,30 @@
     else if (DefX->getOpcode() == Instruction::LShr)
       InitXNext =
           Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
+    else if (DefX->getOpcode() == Instruction::Shl) // cttz
+      InitXNext =
+          Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
     else
       llvm_unreachable("Unexpected opcode!");
   } else
     InitXNext = InitX;
-  CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
+  FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
   Count = Builder.CreateSub(
-      ConstantInt::get(CTLZ->getType(),
-                       CTLZ->getType()->getIntegerBitWidth()),
-      CTLZ);
+      ConstantInt::get(FFS->getType(),
+                       FFS->getType()->getIntegerBitWidth()),
+      FFS);
   if (IsCntPhiUsedOutsideLoop) {
     CountPrev = Count;
     Count = Builder.CreateAdd(
         CountPrev,
         ConstantInt::get(CountPrev->getType(), 1));
   }
-  if (IsCntPhiUsedOutsideLoop)
-    NewCount = Builder.CreateZExtOrTrunc(CountPrev,
-        cast<IntegerType>(CntInst->getType()));
-  else
-    NewCount = Builder.CreateZExtOrTrunc(Count,
-        cast<IntegerType>(CntInst->getType()));
 
-  // If the CTLZ counter's initial value is not zero, insert Add Inst.
+  NewCount = Builder.CreateZExtOrTrunc(
+                      IsCntPhiUsedOutsideLoop ? CountPrev : Count,
+                      cast<IntegerType>(CntInst->getType()));
+
+  // If the counter's initial value is not zero, insert Add Inst.
   Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
   ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
   if (!InitConst || !InitConst->isZero())
@@ -1639,8 +1661,7 @@
   LbCond->setOperand(1, ConstantInt::get(Ty, 0));
 
   // Step 3: All the references to the original counter outside
-  // the loop are replaced with the NewCount -- the value returned from
-  // __builtin_ctlz(x).
+  // the loop are replaced with the NewCount
   if (IsCntPhiUsedOutsideLoop)
     CntPhi->replaceUsesOutsideBlock(NewCount, Body);
   else
Index: llvm/trunk/test/Transforms/LoopIdiom/X86/cttz.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/X86/cttz.ll
+++ llvm/trunk/test/Transforms/LoopIdiom/X86/cttz.ll
@@ -0,0 +1,82 @@
+; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=core-avx2 < %s -S | FileCheck --check-prefix=ALL %s
+; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=corei7 < %s -S | FileCheck --check-prefix=ALL %s
+
+; Recognize CTTZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int cttz_zero_check(int n)
+; {
+;   int i = 0;
+;   while(n) {
+;     n <<= 1;
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL-LABEL: @cttz_zero_check
+; ALL: %0 = call i32 @llvm.cttz.i32(i32 %n, i1 true)
+; ALL-NEXT: %1 = sub i32 32, %0
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @cttz_zero_check(i32 %n) {
+entry:
+  %tobool4 = icmp eq i32 %n, 0
+  br i1 %tobool4, label %while.end, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+  br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+  %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.05 = phi i32 [ %shl, %while.body ], [ %n, %while.body.preheader ]
+  %shl = shl i32 %n.addr.05, 1
+  %inc = add nsw i32 %i.06, 1
+  %tobool = icmp eq i32 %shl, 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit: ; preds = %while.body
+  br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTTZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int cttz(int n)
+; {
+;   int i = 0;
+;   while(n <<= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL-LABEL: @cttz
+; ALL: %0 = shl i32 %n, 1
+; ALL-NEXT: %1 = call i32 @llvm.cttz.i32(i32 %0, i1 false)
+; ALL-NEXT: %2 = sub i32 32, %1
+; ALL-NEXT: %3 = add i32 %2, 1
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @cttz(i32 %n) {
+entry:
+  br label %while.cond
+
+while.cond: ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shl, %while.cond ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]
+  %shl = shl i32 %n.addr.0, 1
+  %tobool = icmp eq i32 %shl, 0
+  %inc = add nsw i32 %i.0, 1
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end: ; preds = %while.cond
+  ret i32 %i.0
+}
+