Index: lib/Transforms/Scalar/LoopIdiomRecognize.cpp =================================================================== --- lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -189,8 +189,8 @@ PHINode *CntPhi, Value *Var); bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB, - Instruction *CntInst, PHINode *CntPhi, - Value *Var, Instruction *DefX, + uint64_t InitShift, Instruction *CntInst, + PHINode *CntPhi, Value *Var, Instruction *DefX, const DebugLoc &DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop); @@ -1112,14 +1112,15 @@ return recognizePopcount() || recognizeAndInsertFFS(); } + +using MatchResultFunction = + std::function; + /// Check if the given conditional branch is based on the comparison between /// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is -/// true), the control yields to the loop entry. If the branch matches the -/// behavior, the variable involved in the comparison is returned. This function -/// will be called to see if the precondition and postcondition of the loop are -/// in desirable form. +/// true), the control yields to the loop entry. static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry, - bool JmpOnZero = false) { + bool JmpOnZero, MatchResultFunction MF) { if (!BI || !BI->isConditional()) return nullptr; @@ -1136,12 +1137,194 @@ if (JmpOnZero) std::swap(TrueSucc, FalseSucc); - ICmpInst::Predicate Pred = Cond->getPredicate(); - if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) || - (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry)) - return Cond->getOperand(0); + return MF(Cond, LoopEntry, TrueSucc, FalseSucc); +} - return nullptr; +/// If the branch matches the behavior, the variable involved in the comparison +/// is returned. +static Value *matchConditionRetVar(BranchInst *BI, BasicBlock *LoopEntry, + bool JmpOnZero = false) { + auto MRF = [](ICmpInst *Cond, BasicBlock *LoopEntry, + BasicBlock *TrueSucc, BasicBlock *FalseSucc) -> Value* { + ICmpInst::Predicate Pred = Cond->getPredicate(); + if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) || + (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry)) + return Cond->getOperand(0); + return nullptr; + }; + return matchCondition(BI, LoopEntry, JmpOnZero, MRF); +} + +/// If the branch matches the behavior, return the successor block that variable +/// is not zero. +static BasicBlock *matchConditionRetNonZeroBlock(BranchInst *BI, + bool JmpOnZero = false) { + auto MRF = [](ICmpInst *Cond, BasicBlock *, + BasicBlock *TrueSucc, BasicBlock *FalseSucc) -> Value* { + ICmpInst::Predicate Pred = Cond->getPredicate(); + if (Pred == ICmpInst::ICMP_NE) + return TrueSucc; + if (Pred == ICmpInst::ICMP_EQ) + return FalseSucc; + return nullptr; + }; + return cast_or_null(matchCondition(BI, nullptr, JmpOnZero, MRF)); +} + +/// The core idiom we are trying to detect is: +/// \code +/// if (x == 0) +/// goto loop-exit +/// +/// if (at least number of leading (ctlz) or trailing (cttz) +/// bits of x are zero) +/// goto loop-exit +/// +/// cnt0 = init-val; +/// bitmask0 = init-val2; // must be power-of-2 +/// do { +/// cnt = phi(cnt0, cnt.next); // CntPhi +/// bitmaskphi = phi(bitmask0, bitmask); +/// +/// cnt.next = cnt + 1; // CntInst +/// ... +/// bitmask = bitmaskphi {<<|>>} 1 ; // BitMask, shift direction decides +/// // ctlz or cttz +/// ... +/// bitval = x & bitmask; +/// } while(bitval == 0); +/// +/// loop-exit: +/// \endcode +/// TODO: Handle the case where one operand is BitMaskPhi instead of BitMask. +static bool matchBitMaskShift(Instruction *BitMask, BasicBlock *LoopPreheader, + IntegerType *VarXTy, Intrinsic::ID &IntrinID, + uint64_t &InitShift) { + PHINode *BitMaskPhi = dyn_cast(BitMask->getOperand(0)); + if (!BitMaskPhi) + return false; + + // Check the recurrence on the bitmask + if (BitMaskPhi->getIncomingValueForBlock(BitMask->getParent()) != BitMask) + return false; + + // Check always shift by one + ConstantInt *Inc = dyn_cast(BitMask->getOperand(1)); + if (!Inc || !Inc->isOne()) + return false; + + // bitmask init val must be power-of-2 + ConstantInt *BitMaskPhiInit = + dyn_cast(BitMaskPhi->getIncomingValueForBlock(LoopPreheader)); + if (!BitMaskPhiInit || !BitMaskPhiInit->getValue().isPowerOf2()) + return false; + + // If shifting right and BitMaskPhiInit equals SignBit, it must be LShr + if (BitMask->getOpcode() == Instruction::AShr && + BitMaskPhiInit->getValue().isNegative()) + return false; + + // Make sure the inital BitMask is valid by validatig its number of trailing + // zeros. + const bool IsShiftLeft = BitMask->getOpcode() == Instruction::Shl; + InitShift = BitMaskPhiInit->getValue().countTrailingZeros() + + (IsShiftLeft ? 1 : -1); // Factor in the first shift + if (InitShift >= VarXTy->getBitWidth()) + return false; + + IntrinID = IsShiftLeft ? Intrinsic::cttz : Intrinsic::ctlz; + + // For ctlz, InitShift should be counted from left + if (IntrinID == Intrinsic::ctlz) + InitShift = VarXTy->getBitWidth() - 1 - InitShift; + + return true; +} + +/// The core idiom we are trying to detect is: +/// \code +/// if (x == 0) +/// goto loop-exit +/// +/// if (at least number of leading (ctlz) or trailing (cttz) +/// bits of x are zero) +/// goto loop-exit +/// +/// cnt0 = init-val; +/// shift0 = init-val2; +/// do { +/// cnt = phi(cnt0, cnt.next); // CntPhi +/// shift = phi(shift0, shift.next); +/// +/// cnt.next = cnt + 1; // CntInst +/// ... +/// shift.next = shift {+,-} 1; +/// ... +/// bitmask = {<<|>>} shift.next; +/// ... +/// bitval = x & bitmask; +/// } while(bitval == 0); +/// +/// loop-exit: +/// \endcode +static bool matchBitMaskIdxShift(Instruction *BitMask, ScalarEvolution *SE, + PHINode *CntPhi, IntegerType *VarXTy, + Intrinsic::ID &IntrinID, uint64_t &InitShift, + size_t &IdiomCanonicalSize) { + // First operand of BitMask is power-of-2 constant integer + // Second operand is either CntPhi or CntInst + ConstantInt *BitMaskConstInt = dyn_cast(BitMask->getOperand(0)); + if (!BitMaskConstInt || !BitMaskConstInt->getValue().isPowerOf2()) + return false; + + // If shifting right and BitMaskPhiInit equals SignBit, it must be LShr + if (BitMask->getOpcode() == Instruction::AShr && + BitMaskConstInt->getValue() == + APInt::getSignMask(BitMaskConstInt->getBitWidth())) + return false; + + const SCEVAddRecExpr *ShiftSCEV = + dyn_cast(SE->getSCEV(BitMask->getOperand(1))); + if (!ShiftSCEV || ShiftSCEV->getNumOperands() != 2) + return false; + + const SCEVConstant *Base = dyn_cast(ShiftSCEV->getOperand(0)); + const SCEVConstant *Stride = dyn_cast(ShiftSCEV->getOperand(1)); + if (!Base || !Stride || + (!Stride->isOne() && !Stride->getValue()->isMinusOne())) + return false; + + APInt Init = Base->getAPInt(); + if (BitMask->getOpcode() == Instruction::Shl) + Init = BitMaskConstInt->getValue().shl(Init); + else if (BitMask->getOpcode() == Instruction::LShr) + Init = BitMaskConstInt->getValue().lshr(Init); + else if (BitMask->getOpcode() == Instruction::AShr) + Init = BitMaskConstInt->getValue().ashr(Init); + + // Make sure the inital BitMask is valid + if (Init.countTrailingZeros() >= VarXTy->getBitWidth()) + return false; + + // Both stride and shift direction decide if cttz or ctlz + const bool PositiveStride = Stride->isOne(); + if (BitMask->getOpcode() == Instruction::Shl) + IntrinID = PositiveStride ? Intrinsic::cttz : Intrinsic::ctlz; + else + IntrinID = PositiveStride ? Intrinsic::ctlz : Intrinsic::cttz; + + // For ctlz, InitShift should be counted from left + InitShift = Init.countTrailingZeros(); + if (IntrinID == Intrinsic::ctlz) + InitShift = VarXTy->getBitWidth() - 1 - InitShift; + + // If IdxPhi is the same as CntPhi, IdiomCanonicalSize is 6. + PHINode *IdxPhi = dyn_cast(BitMask->getOperand(1)); + if (!IdxPhi) + IdxPhi = cast(cast(BitMask->getOperand(1))->getOperand(0)); + IdiomCanonicalSize = IdxPhi == CntPhi ? 6 : 8; + + return true; } // Check if the recurrence variable `VarX` is in the right form to create @@ -1155,6 +1338,38 @@ return nullptr; } +// Find the recurrence pattern cnt.next = cnt + 1 in a single block loop, +// return true if pattern is found and set CntInst / CntPhi, otherwise return +// return false. If multiple instances of this pattern exist, return the first +// one found. +static bool findConstantOneAddRecurrence(Instruction **CntInst, + PHINode **CntPhi, + BasicBlock *LoopEntry) { + for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(), + IterE = LoopEntry->end(); + Iter != IterE; Iter++) { + Instruction *Inst = &*Iter; + if (Inst->getOpcode() != Instruction::Add) + continue; + + ConstantInt *Inc = dyn_cast(Inst->getOperand(1)); + if (!Inc || !Inc->isOne()) + continue; + + PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry); + if (!Phi) + continue; + + if (CntInst) + *CntInst = Inst; + if (CntPhi) + *CntPhi = Phi; + return true; + } + + return false; +} + /// Return true iff the idiom is detected in the loop. /// /// Additionally: @@ -1196,7 +1411,7 @@ // step 1: Check if the loop-back branch is in desirable form. { - if (Value *T = matchCondition( + if (Value *T = matchConditionRetVar( dyn_cast(LoopEntry->getTerminator()), LoopEntry)) DefX2 = dyn_cast(T); else @@ -1275,7 +1490,7 @@ // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" { auto *PreCondBr = dyn_cast(PreCondBB->getTerminator()); - Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader()); + Value *T = matchConditionRetVar(PreCondBr, CurLoop->getLoopPreheader()); if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) return false; @@ -1287,6 +1502,154 @@ return true; } +/// The core idiom we are trying to detect is: +/// \code +/// if (x == 0) +/// goto loop-exit +/// +/// if (at least number of leading (ctlz) or trailing (cttz) +/// bits of x are zero) +/// goto loop-exit +/// +/// cnt0 = init-val; +/// do { +/// cnt = phi(cnt0, cnt.next); // CntPhi +/// +/// cnt.next = cnt + 1; // CntInst +/// ... +/// bitmask = ... ; // BitMask. Three kinds of scalar evolution are +/// // detected for it. See matchBitMaskIdxShift and +/// // matchBitMaskShift. +/// ... +/// bitval = x & bitmask; +/// } while(bitval == 0); +/// +/// loop-exit: +/// \endcode +/// TODO: detect multi-block version of bitscan where, instead of doing a zero +/// check before loop, doing a limit check for each loop iteration +static bool detectBitScanIdiom(Loop *CurLoop, DominatorTree *DT, + ScalarEvolution *SE, Intrinsic::ID &IntrinID, + uint64_t &InitShift, Value *&VarX, + Instruction *&CntInst, PHINode *&CntPhi, + size_t &IdiomCanonicalSize) { + BasicBlock *LoopPreheader = CurLoop->getLoopPreheader(); + BasicBlock *LoopEntry = *(CurLoop->block_begin()); + Instruction *BitVal = nullptr; + Instruction *BitMask = nullptr; + CntInst = nullptr; + CntPhi = nullptr; + + // Find CntInst and CntPhi. + if (!findConstantOneAddRecurrence(&CntInst, &CntPhi, LoopEntry)) + return false; + + // Check if the loop-back branch is in desirable form. + if (Value *T = matchConditionRetVar( + dyn_cast(LoopEntry->getTerminator()), LoopEntry, true)) + BitVal = dyn_cast(T); + else + return false; + + // BitVal must be bitwise and instruction + if (!BitVal || BitVal->getOpcode() != Instruction::And) + return false; + + // One operand of BitVal is BitMask, the other is VarX + VarX = BitVal->getOperand(0); + BitMask = dyn_cast(BitVal->getOperand(1)); + if (!CurLoop->isLoopInvariant(VarX)) { + VarX = BitVal->getOperand(1); + BitMask = dyn_cast(BitVal->getOperand(0)); + } + if (!CurLoop->isLoopInvariant(VarX) || !BitMask || + BitMask->getParent() != LoopEntry || !BitMask->isShift()) + return false; + + // VarX has integer type + IntegerType *VarXTy = dyn_cast(VarX->getType()); + if (!VarXTy) + return false; + + // Scalar evolution of BitMask decides if the idiom is ctlz or cttz + if (matchBitMaskShift(BitMask, LoopPreheader, VarXTy, IntrinID, InitShift)) + IdiomCanonicalSize = 7; + else if (!matchBitMaskIdxShift(BitMask, SE, CntPhi, VarXTy, + IntrinID, InitShift, IdiomCanonicalSize)) + return false; + + // Decide from CFG that if VarX is not zero and number of leading + // (ctlz) or trailing(cttz) bits are zero. + + // Find block to prove x != 0 + bool CheckVarXIsNotZero = false; + // Find block to prove x has num of leading / trailing bits clear + bool CheckVarXFirstBitsAreZero = InitShift == 0; + + for (Use &U : VarX->uses()) { + auto *UI = dyn_cast(U.getUser()); + if (!UI) + continue; + + BasicBlock *CurBB = UI->getParent(); + BranchInst *BI = dyn_cast(CurBB->getTerminator()); + if (!BI) + continue; + + if (auto *Cmp = dyn_cast(UI)) { + // CheckVarXIsNotZero + if (BasicBlock *BB = matchConditionRetNonZeroBlock(BI)) { + BasicBlockEdge BBE(CurBB, BB); + if (DT->dominates(BBE, LoopEntry)) + CheckVarXIsNotZero = true; + } + + // CheckVarXFirstBitsAreZero + // Detect %cmp = icmp sgt i32 %x, -1 as alternative to + // %and = and i32 %x, + // %cmp = icmp eq i32 %and, 0 + if (!CheckVarXFirstBitsAreZero && + IntrinID == Intrinsic::ctlz && + Cmp->getPredicate() == CmpInst::ICMP_SGT) + if (ConstantInt *CI = dyn_cast(UI->getOperand(1))) + if (CI->isMinusOne()) + CheckVarXFirstBitsAreZero = true; + } else if (!CheckVarXFirstBitsAreZero && + UI->getOpcode() == Instruction::And) { + // CheckVarXFirstBitsAreZero + // Check for pattern: + // %and = and i32 %x, + // %cmp = icmp eq i32 %and, 0 + + ConstantInt *Mask = dyn_cast(UI->getOperand(1)); + if (!Mask) + continue; + + // Check the constant value of Mask such that AT LEAST leading / trailing + // InitShift number of bits are set, and the rest of bits are clear. + + // TheMask = Mask + 1 + APInt TheMask = Mask->getValue(); + if (IntrinID == Intrinsic::ctlz) // Check high bits of Mask if ctlz + TheMask = TheMask.reverseBits(); + TheMask = TheMask + 1; + + if (TheMask.isPowerOf2() && TheMask.countTrailingZeros() >= InitShift) + if (BasicBlock *BB = matchConditionRetNonZeroBlock(BI, true)) { + BasicBlockEdge BBE(CurBB, BB); + if (DT->dominates(BBE, LoopEntry) && + cast(BI->getCondition())->getOperand(0) == UI) + CheckVarXFirstBitsAreZero = true; + } + } + } + + if (!CheckVarXIsNotZero || !CheckVarXFirstBitsAreZero) + return false; + + return true; +} + /// Return true if the idiom is detected in the loop. /// /// Additionally: @@ -1327,7 +1690,7 @@ LoopEntry = *(CurLoop->block_begin()); // step 1: Check if the loop-back branch is in desirable form. - if (Value *T = matchCondition( + if (Value *T = matchConditionRetVar( dyn_cast(LoopEntry->getTerminator()), LoopEntry)) DefX = dyn_cast(T); else @@ -1361,26 +1724,7 @@ // plus "cnt0". Currently it is not optimized. // This step could be used to detect POPCNT instruction: // cnt.next = cnt + (x.next & 1) - for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(), - IterE = LoopEntry->end(); - Iter != IterE; Iter++) { - Instruction *Inst = &*Iter; - if (Inst->getOpcode() != Instruction::Add) - continue; - - ConstantInt *Inc = dyn_cast(Inst->getOperand(1)); - if (!Inc || !Inc->isOne()) - continue; - - PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry); - if (!Phi) - continue; - - CntInst = Inst; - CntPhi = Phi; - break; - } - if (!CntInst) + if (!findConstantOneAddRecurrence(&CntInst, &CntPhi, LoopEntry)) return false; return true; @@ -1389,6 +1733,7 @@ /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop /// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new /// trip count returns true; otherwise, returns false. +/// For either, two kinds of idioms are detected: ShiftUntilZero or BitScan. bool LoopIdiomRecognize::recognizeAndInsertFFS() { // Give up if the loop has multiple blocks or multiple backedges. if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) @@ -1400,13 +1745,23 @@ PHINode *CntPhi = nullptr; Instruction *CntInst = nullptr; // Help decide if transformation is profitable. For ShiftUntilZero idiom, - // this is always 6. + // this is always 6. For BitScan idiom, this could be 6,7 or 8. size_t IdiomCanonicalSize = 6; - - if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, - CntInst, CntPhi, DefX)) + // For BitScan idiom only + uint64_t InitShift; + + bool FoundBitScanIdiom; + if (detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, + CntInst, CntPhi, DefX)) + FoundBitScanIdiom = false; + else if (detectBitScanIdiom(CurLoop, DT, SE, IntrinID, InitShift, InitX, + CntInst, CntPhi, IdiomCanonicalSize)) + FoundBitScanIdiom = true; + else return false; + // FoundBitScanIdiom is false if detect ShiftUntilZero idiom + bool IsCntPhiUsedOutsideLoop = false; for (User *U : CntPhi->users()) if (!CurLoop->contains(cast(U))) { @@ -1427,7 +1782,8 @@ // For some CPUs result of CTLZ(X) intrinsic is undefined // when X is 0. If we can not guarantee X != 0, we need to check this // when expand. - bool ZeroCheck = false; + bool ZeroCheck = FoundBitScanIdiom; + // It is safe to assume Preheader exist as it was checked in // parent function RunOnLoop. BasicBlock *PH = CurLoop->getLoopPreheader(); @@ -1436,14 +1792,14 @@ // have a zero check as a precondition. Without the check the loop would run // one iteration for before any check of the input value. This means 0 and 1 // would have identical behavior in the original loop and thus - if (!IsCntPhiUsedOutsideLoop) { + if (!FoundBitScanIdiom && !IsCntPhiUsedOutsideLoop) { auto *PreCondBB = PH->getSinglePredecessor(); if (!PreCondBB) return false; auto *PreCondBI = dyn_cast(PreCondBB->getTerminator()); if (!PreCondBI) return false; - if (matchCondition(PreCondBI, PH) != InitX) + if (matchConditionRetVar(PreCondBI, PH) != InitX) return false; ZeroCheck = true; } @@ -1451,7 +1807,7 @@ // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always // profitable if we delete the loop. - // the loop has only 6 instructions: + // For ShiftUntilZero idiom, the loop has only 6 instructions: // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ] // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ] // %shr = ashr %n.addr.0, 1 @@ -1467,9 +1823,9 @@ TargetTransformInfo::TCC_Basic) return false; - transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX, - DefX->getDebugLoc(), ZeroCheck, - IsCntPhiUsedOutsideLoop); + const DebugLoc &XDL = DefX ? DefX->getDebugLoc() : CntInst->getDebugLoc(); + transformLoopToCountable(IntrinID, PH, InitShift, CntInst, CntPhi, InitX, + DefX, XDL, ZeroCheck, IsCntPhiUsedOutsideLoop); return true; } @@ -1561,11 +1917,19 @@ /// Use(CntPhi) or Use(CntInst) /// /// Into: -/// If CntPhi used outside the loop: -/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1) -/// Count = CountPrev + 1 -/// else -/// Count = BitWidth(InitX) - CTLZ(InitX) +/// (ShiftUntilZero): +/// If CntPhi used outside the loop: +/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1) +/// Count = CountPrev + 1 +/// else +/// Count = BitWidth(InitX) - CTLZ(InitX) +/// +/// (BitScan): +/// If CntPhi used outside the loop: +/// CountPrev = CTLZ(InitX) - InitShift +/// Count = CountPrev + 1 +/// else +/// Count = CTLZ(InitX) - InitShift + 1 /// loop: /// CntPhi = PHI [Cnt0, CntInst] /// PhiX = PHI [InitX, DefX] @@ -1582,9 +1946,10 @@ /// If LOOP_BODY is empty the loop will be deleted. /// If CntInst and DefX are not used in LOOP_BODY they will be removed. void LoopIdiomRecognize::transformLoopToCountable( - Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst, - PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL, - bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { + Intrinsic::ID IntrinID, BasicBlock *Preheader, uint64_t InitShift, + Instruction *CntInst, PHINode *CntPhi, Value *InitX, Instruction *DefX, + const DebugLoc &DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { + const bool FoundBitScanIdiom = !DefX; BranchInst *PreheaderBr = cast(Preheader->getTerminator()); // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block @@ -1592,33 +1957,43 @@ Builder.SetCurrentDebugLocation(DL); Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext; - // Count = BitWidth - CTLZ(InitX); - // If there are uses of CntPhi create: - // CountPrev = BitWidth - CTLZ(InitX >> 1); - if (IsCntPhiUsedOutsideLoop) { - if (DefX->getOpcode() == Instruction::AShr) - InitXNext = - Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1)); - else if (DefX->getOpcode() == Instruction::LShr) - InitXNext = - Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1)); - else if (DefX->getOpcode() == Instruction::Shl) // cttz - InitXNext = - Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1)); - else - llvm_unreachable("Unexpected opcode!"); - } else - InitXNext = InitX; - FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID); - Count = Builder.CreateSub( - ConstantInt::get(FFS->getType(), - FFS->getType()->getIntegerBitWidth()), - FFS); - if (IsCntPhiUsedOutsideLoop) { - CountPrev = Count; - Count = Builder.CreateAdd( - CountPrev, - ConstantInt::get(CountPrev->getType(), 1)); + if (FoundBitScanIdiom) { + FFS = createFFSIntrinsic(Builder, InitX, DL, ZeroCheck, IntrinID); + Count = Builder.CreateSub(FFS, + ConstantInt::get(FFS->getType(), InitShift)); + if (IsCntPhiUsedOutsideLoop) + CountPrev = Count; + Count = Builder.CreateAdd(Count, + ConstantInt::get(Count->getType(), 1)); + } else { + // Count = BitWidth - CTLZ(InitX); + // If there are uses of CntPhi create: + // CountPrev = BitWidth - CTLZ(InitX >> 1); + if (IsCntPhiUsedOutsideLoop) { + if (DefX->getOpcode() == Instruction::AShr) + InitXNext = + Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1)); + else if (DefX->getOpcode() == Instruction::LShr) + InitXNext = + Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1)); + else if (DefX->getOpcode() == Instruction::Shl) // cttz + InitXNext = + Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1)); + else + llvm_unreachable("Unexpected opcode!"); + } else + InitXNext = InitX; + FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID); + Count = Builder.CreateSub( + ConstantInt::get(FFS->getType(), + FFS->getType()->getIntegerBitWidth()), + FFS); + if (IsCntPhiUsedOutsideLoop) { + CountPrev = Count; + Count = Builder.CreateAdd( + CountPrev, + ConstantInt::get(CountPrev->getType(), 1)); + } } NewCount = Builder.CreateZExtOrTrunc( Index: test/Transforms/LoopIdiom/X86/ctlz.ll =================================================================== --- test/Transforms/LoopIdiom/X86/ctlz.ll +++ test/Transforms/LoopIdiom/X86/ctlz.ll @@ -1,6 +1,361 @@ ; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=core-avx2 < %s -S | FileCheck -check-prefix=LZCNT --check-prefix=ALL %s ; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=corei7 < %s -S | FileCheck -check-prefix=NOLZCNT --check-prefix=ALL %s +; int ctlz_bitscan_shl(int x) { +; int count = 0; +; if (!x) return sizeof(x)*8; +; unsigned shift = sizeof(x)*8; +; while ((x & (1<< --shift)) == 0) { +; count++; +; } +; return count; +; } +; +; ALL-LABEL: @ctlz_bitscan_shl +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 0 +; ALL-NEXT: %2 = add i32 %1, 1 +; ALL-NEXT: %3 = add i32 %1, 10 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_shl(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + br label %while.cond + +while.cond: ; preds = %while.cond.preheader, %while.cond + %count.0 = phi i32 [ %inc, %while.cond ], [ 10, %while.cond.preheader ] + %shift.0 = phi i32 [ %dec, %while.cond ], [ 32, %while.cond.preheader ] + %dec = add i32 %shift.0, -1 + %shl = shl i32 1, %dec + %and = and i32 %shl, %x + %cmp = icmp eq i32 %and, 0 + %inc = add nuw nsw i32 %count.0, 1 + br i1 %cmp, label %while.cond, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.cond + %count.0.lcssa = phi i32 [ %count.0, %while.cond ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %entry + %retval.0 = phi i32 [ 32, %entry ], [ %count.0.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; int ctlz_bitscan_lshr(int x) { +; int count = 0; +; if (!x) return sizeof(x)*8; +; unsigned m = 0x80000000; +; int i = 0; +; while ((x & (m >> i)) == 0) { +; ++i; +; count++; +; } +; return count; +; } +; +; ALL-LABEL: @ctlz_bitscan_lshr +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 1 +; ALL-NEXT: %2 = add i32 %1, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_lshr(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %cmp9 = icmp sgt i32 %x, -1 + br i1 %cmp9, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.011 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %i.011, 1 + %shr = lshr i32 1073741824, %i.011 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc1.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc1.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; ALL-LABEL: @ctlz_bitscan_ashr +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 1 +; ALL-NEXT: %2 = add i32 %1, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_ashr(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %cmp9 = icmp sgt i32 %x, -1 + br i1 %cmp9, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.011 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %i.011, 1 + %shr = ashr i32 1073741824, %i.011 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc1.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc1.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; For AShr, left operand can not be sign mask. +; +; ALL-LABEL: @ctlz_bitscan_ashr_bad_mask +; ALL-NOT: @llvm.ctlz +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_ashr_bad_mask(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %cmp9 = icmp sgt i32 %x, -1 + br i1 %cmp9, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.011 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %i.011, 1 + %shr = ashr i32 -2147483648, %i.011 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc1.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc1.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; Mask iterate on itself. +; int ctlz_bitscan_lshr_phi(int x) { +; int count = 0; +; if (!x) return sizeof(x)*8; +; unsigned mask = 0x1 << (sizeof(x)*8-1); +; while ((x & mask) == 0) { +; count++; +; mask >>= 1; +; } +; return count; +; } +; +; ALL-LABEL: @ctlz_bitscan_lshr_phi +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 1 +; ALL-NEXT: %2 = add i32 %1, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_lshr_phi(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %cmp6 = icmp sgt i32 %x, -1 + br i1 %cmp6, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %mask.08 = phi i32 [ %shr, %while.body ], [ -2147483648, %while.body.preheader ] + %count.07 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %count.07, 1 + %shr = lshr i32 %mask.08, 1 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; ALL-LABEL: @ctlz_bitscan_lshr_phi_bad_mask +; ALL-NOT: @llvm.ctlz +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_lshr_phi_bad_mask(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %cmp6 = icmp sgt i32 %x, -1 + br i1 %cmp6, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %mask.08 = phi i32 [ %shr, %while.body ], [ -2147483648, %while.body.preheader ] + %count.07 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %count.07, 1 + %shr = ashr i32 %mask.08, 1 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; Mask iterate on itself, starting from midpoint of %x +; +; ALL-LABEL: @ctlz_bitscan_mid +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 17 +; ALL-NEXT: %2 = add i32 %1, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_mid(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %and1 = and i32 %x, 4294950912 + %cmp6 = icmp eq i32 %and1, 0 + br i1 %cmp6, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %mask.08 = phi i32 [ %shr, %while.body ], [ 32768, %while.body.preheader ] + %count.07 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %count.07, 1 + %shr = lshr i32 %mask.08, 1 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; Mask iterate on itself, starting from midpoint of %x. However, block +; %while.cond.preheader only prove high 12 bits are zero, instead of 16 bits +; required by 65536. No ctlz pattern here. +; +; ALL-LABEL: @ctlz_bitscan_mid_first_bits_check_fail +; ALL-NOT: @llvm.ctlz +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_mid_first_bits_check_fail(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %and1 = and i32 %x, 4293918720 + %cmp6 = icmp eq i32 %and1, 0 + br i1 %cmp6, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %mask.08 = phi i32 [ %shr, %while.body ], [ 65536, %while.body.preheader ] + %count.07 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %count.07, 1 + %shr = lshr i32 %mask.08, 1 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; %x could be zero. No ctlz pattern here. +; +; ALL-LABEL: @ctlz_bitscan_lshr_zero_check_fail +; ALL-NOT: @llvm.ctlz +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_lshr_zero_check_fail(i32 %x) { +entry: + br label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %cmp9 = icmp sgt i32 %x, -1 + br i1 %cmp9, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.011 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %i.011, 1 + %shr = lshr i32 1073741824, %i.011 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc1.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 0, %while.cond.preheader ], [ %inc1.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + ; Recognize CTLZ builtin pattern. ; Here we'll just convert loop to countable, ; so do not insert builtin if CPU do not support CTLZ @@ -16,7 +371,7 @@ ; return i; ; } ; -; LZCNT: entry +; LZCNT-LABEL: @ctlz_and_other ; LZCNT: %0 = call i32 @llvm.ctlz.i32(i32 %shr8, i1 true) ; LZCNT-NEXT: %1 = sub i32 32, %0 ; LZCNT-NEXT: %2 = zext i32 %1 to i64 @@ -25,8 +380,8 @@ ; LZCNT: %i.0.lcssa = phi i32 [ 0, %entry ], [ %4, %while.end.loopexit ] ; LZCNT: ret i32 %i.0.lcssa -; NOLZCNT: entry -; NOLZCNT-NOT: @llvm.ctlz +; NOLZCNT-LABEL: @ctlz_and_other +; NOLZCNT-NOT: @llvm.ctlz ; Function Attrs: norecurse nounwind uwtable define i32 @ctlz_and_other(i32 %n, i8* nocapture %a) { @@ -71,7 +426,7 @@ ; ; int ctlz_zero_check(int n) ; { -; n = n >= 0 ? n : -n; +; n = n > 0 ? n : -n; ; int i = 0; ; while(n) { ; n >>= 1; @@ -80,7 +435,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_zero_check ; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %abs_n, i1 true) ; ALL-NEXT: %1 = sub i32 32, %0 ; ALL: %inc.lcssa = phi i32 [ %1, %while.body ] @@ -129,7 +484,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_zero_check_lshr ; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %n, i1 true) ; ALL-NEXT: %1 = sub i32 32, %0 ; ALL: %inc.lcssa = phi i32 [ %1, %while.body ] @@ -175,7 +530,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz ; ALL: %0 = ashr i32 %abs_n, 1 ; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) ; ALL-NEXT: %2 = sub i32 32, %1 @@ -216,7 +571,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_lshr ; ALL: %0 = lshr i32 %n, 1 ; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) ; ALL-NEXT: %2 = sub i32 32, %1 @@ -255,7 +610,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_add ; ALL: %0 = ashr i32 %abs_n, 1 ; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) ; ALL-NEXT: %2 = sub i32 32, %1 @@ -297,7 +652,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_add_lshr ; ALL: %0 = lshr i32 %n, 1 ; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) ; ALL-NEXT: %2 = sub i32 32, %1 @@ -339,7 +694,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_sext ; ALL: %0 = ashr i32 %abs_n, 1 ; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) ; ALL-NEXT: %2 = sub i32 32, %1 @@ -381,7 +736,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_sext_lshr ; ALL: %0 = lshr i32 %n, 1 ; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) ; ALL-NEXT: %2 = sub i32 32, %1 Index: test/Transforms/LoopIdiom/X86/cttz.ll =================================================================== --- test/Transforms/LoopIdiom/X86/cttz.ll +++ test/Transforms/LoopIdiom/X86/cttz.ll @@ -1,6 +1,199 @@ ; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=core-avx2 < %s -S | FileCheck --check-prefix=ALL %s ; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=corei7 < %s -S | FileCheck --check-prefix=ALL %s +; int cttz_bitscan_shl(int x) { +; int count = 0; +; if (!x) return sizeof(x)*8; +; int i = 0; +; while (((x >> i) & 0x1) == 0) { +; i++; +; count++; +; } +; return count; +; } +; +; ALL-LABEL: @cttz_bitscan_shl +; ALL: %1 = call i32 @llvm.cttz.i32(i32 %x, i1 true) +; ALL-NEXT: %2 = sub i32 %1, 1 +; ALL-NEXT: %3 = add i32 %2, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @cttz_bitscan_shl(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %0 = and i32 %x, 1 + %cmp8 = icmp eq i32 %0, 0 + br i1 %cmp8, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.010 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %i.010, 1 + %1 = shl i32 2, %i.010 + %2 = and i32 %1, %x + %cmp = icmp eq i32 %2, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc1.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc1.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; int cttz_bitscan_shl_phi(int x) { +; int count = 0; +; if (!x) return sizeof(x)*8; +; unsigned i = 1; +; while ((x & i) == 0) { +; i <<= 1; +; count++; +; } +; return count; +; } +; +; ALL-LABEL: @cttz_bitscan_shl_phi +; ALL: %0 = call i32 @llvm.cttz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 1 +; ALL-NEXT: %2 = add i32 %1, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @cttz_bitscan_shl_phi(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %and6 = and i32 %x, 1 + %cmp7 = icmp eq i32 %and6, 0 + br i1 %cmp7, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.09 = phi i32 [ %shl, %while.body ], [ 1, %while.body.preheader ] + %count.08 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %shl = shl i32 %i.09, 1 + %inc = add nuw nsw i32 %count.08, 1 + %and = and i32 %shl, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; int cttz_bitscan_shl_phi_non_constant_count(int x) { +; int count = x; +; if (!x) return sizeof(x)*8; +; unsigned i = 1; +; while ((x & i) == 0) { +; i <<= 1; +; count++; +; } +; return count; +; } +; +; ALL-LABEL: @cttz_bitscan_shl_phi_non_constant_count +; ALL: %0 = call i32 @llvm.cttz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 1 +; ALL-NEXT: %2 = add i32 %1, 1 +; ALL-NEXT: %3 = add i32 %2, %x +; +define i32 @cttz_bitscan_shl_phi_non_constant_count(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %and6 = and i32 %x, 1 + %cmp7 = icmp eq i32 %and6, 0 + br i1 %cmp7, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.09 = phi i32 [ %shl, %while.body ], [ 1, %while.body.preheader ] + %count.08 = phi i32 [ %inc, %while.body ], [ %x, %while.body.preheader ] + %shl = shl i32 %i.09, 1 + %inc = add nuw nsw i32 %count.08, 1 + %and = and i32 %shl, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; int cttz_bitscan_lshr(int x) { +; int count = 0; +; if (!x) return sizeof(x)*8; +; unsigned m = 0x80000000; +; int i = 31; +; while (((m >> i) & x) == 0) { +; --i; +; count++; +; } +; return count; +; } +; +; ALL-LABEL: @cttz_bitscan_lshr +; ALL: %0 = call i32 @llvm.cttz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 1 +; ALL-NEXT: %2 = add i32 %1, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @cttz_bitscan_lshr(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %and7 = and i32 %x, 1 + %cmp8 = icmp eq i32 %and7, 0 + br i1 %cmp8, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.010 = phi i32 [ %dec, %while.body ], [ 31, %while.body.preheader ] + %count.09 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %dec = add nsw i32 %i.010, -1 + %inc = add nuw nsw i32 %count.09, 1 + %shr = lshr i32 -2147483648, %dec + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + ; Recognize CTTZ builtin pattern. ; Here it will replace the loop - ; assume builtin is always profitable.