Index: lib/Transforms/Scalar/LoopIdiomRecognize.cpp =================================================================== --- lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -26,7 +26,7 @@ // Future floating point idioms to recognize in -ffast-math mode: // fpowi // Future integer operation idioms to recognize: -// ctpop, ctlz, cttz +// ctpop // // Beware that isel's default lowering for ctpop is highly inefficient for // i64 and larger types when i64 is legal and the value has few bits set. It @@ -187,8 +187,9 @@ bool recognizePopcount(); void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var); - bool recognizeAndInsertCTLZ(); - void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst, + bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz + void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB, + uint64_t InitShift, Instruction *CntInst, PHINode *CntPhi, Value *Var, Instruction *DefX, const DebugLoc &DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop); @@ -1107,15 +1108,18 @@ } bool LoopIdiomRecognize::runOnNoncountableLoop() { - return recognizePopcount() || recognizeAndInsertCTLZ(); + return recognizePopcount() || recognizeAndInsertFFS(); } + +using MatchResultFunction = + std::function; + /// Check if the given conditional branch is based on the comparison between -/// a variable and zero, and if the variable is non-zero, the control yields to -/// the loop entry. If the branch matches the behavior, the variable involved -/// in the comparison is returned. This function will be called to see if the -/// precondition and postcondition of the loop are in desirable form. -static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { +/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is +/// true), the control yields to the loop entry. +static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry, + bool JmpOnZero, MatchResultFunction MF) { if (!BI || !BI->isConditional()) return nullptr; @@ -1127,12 +1131,199 @@ if (!CmpZero || !CmpZero->isZero()) return nullptr; - ICmpInst::Predicate Pred = Cond->getPredicate(); - if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) || - (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry)) - return Cond->getOperand(0); + BasicBlock *TrueSucc = BI->getSuccessor(0); + BasicBlock *FalseSucc = BI->getSuccessor(1); + if (JmpOnZero) + std::swap(TrueSucc, FalseSucc); - return nullptr; + return MF(Cond, LoopEntry, TrueSucc, FalseSucc); +} + +/// If the branch matches the behavior, the variable involved in the comparison +/// is returned. +static Value *matchConditionRetVar(BranchInst *BI, BasicBlock *LoopEntry, + bool JmpOnZero = false) { + auto MRF = [](ICmpInst *Cond, BasicBlock *LoopEntry, + BasicBlock *TrueSucc, BasicBlock *FalseSucc) -> Value* { + ICmpInst::Predicate Pred = Cond->getPredicate(); + if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) || + (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry)) + return Cond->getOperand(0); + return nullptr; + }; + return matchCondition(BI, LoopEntry, JmpOnZero, MRF); +} + +/// If the branch matches the behavior, return the successor block that variable +/// is not zero. +static BasicBlock *matchConditionRetNonZeroBlock(BranchInst *BI, + bool JmpOnZero = false) { + auto MRF = [](ICmpInst *Cond, BasicBlock *, + BasicBlock *TrueSucc, BasicBlock *FalseSucc) -> Value* { + ICmpInst::Predicate Pred = Cond->getPredicate(); + if (Pred == ICmpInst::ICMP_NE) + return TrueSucc; + if (Pred == ICmpInst::ICMP_EQ) + return FalseSucc; + return nullptr; + }; + return cast_or_null(matchCondition(BI, nullptr, JmpOnZero, MRF)); +} + +/// The core idiom we are trying to detect is: +/// \code +/// if (x == 0) +/// goto loop-exit +/// +/// if (at least number of leading (ctlz) or trailing (cttz) +/// bits of x are zero) +/// goto loop-exit +/// +/// cnt0 = init-val; +/// bitmask0 = init-val2; // must be power-of-2 +/// do { +/// cnt = phi(cnt0, cnt.next); // CntPhi +/// bitmaskphi = phi(bitmask0, bitmask); +/// +/// cnt.next = cnt + 1; // CntInst +/// ... +/// bitmask = bitmaskphi {<<|>>} 1 ; // BitMask, shift direction decides +/// // ctlz or cttz +/// ... +/// bitval = x & bitmask; +/// } while(bitval == 0); +/// +/// loop-exit: +/// \endcode +/// TODO: Handle the case where one operand is BitMaskPhi instead of BitMask. +static bool matchBitMaskShift(Instruction *BitMask, BasicBlock *LoopPreheader, + IntegerType *VarXTy, Intrinsic::ID &IntrinID, + uint64_t &InitShift) { + PHINode *BitMaskPhi = dyn_cast(BitMask->getOperand(0)); + if (!BitMaskPhi) + return false; + + // Check the recurrence on the bitmask + if (BitMaskPhi->getIncomingValueForBlock(BitMask->getParent()) != BitMask) + return false; + + // Check always shift by one + ConstantInt *Inc = dyn_cast(BitMask->getOperand(1)); + if (!Inc || !Inc->isOne()) + return false; + + // bitmask init val must be power-of-2 + ConstantInt *BitMaskPhiInit = + dyn_cast(BitMaskPhi->getIncomingValueForBlock(LoopPreheader)); + if (!BitMaskPhiInit || !BitMaskPhiInit->getValue().isPowerOf2()) + return false; + + // If shifting right and BitMaskPhiInit equals SignBit, it must be LShr + if (BitMask->getOpcode() == Instruction::AShr && + BitMaskPhiInit->getValue().isNegative()) + return false; + + // Make sure the inital BitMask is valid by validatig its number of trailing + // zeros. + const bool IsShiftLeft = BitMask->getOpcode() == Instruction::Shl; + InitShift = BitMaskPhiInit->getValue().countTrailingZeros() + + (IsShiftLeft ? 1 : -1); // Factor in the first shift + if (InitShift >= VarXTy->getBitWidth()) + return false; + + IntrinID = IsShiftLeft ? Intrinsic::cttz : Intrinsic::ctlz; + + // For ctlz, InitShift should be counted from left + if (IntrinID == Intrinsic::ctlz) + InitShift = VarXTy->getBitWidth() - 1 - InitShift; + + return true; +} + +/// The core idiom we are trying to detect is: +/// \code +/// if (x == 0) +/// goto loop-exit +/// +/// if (at least number of leading (ctlz) or trailing (cttz) +/// bits of x are zero) +/// goto loop-exit +/// +/// cnt0 = init-val; +/// shift0 = init-val2; +/// do { +/// cnt = phi(cnt0, cnt.next); // CntPhi +/// shift = phi(shift0, shift.next); +/// +/// cnt.next = cnt + 1; // CntInst +/// ... +/// shift.next = shift {+,-} 1; +/// ... +/// bitmask = {<<|>>} shift.next; +/// ... +/// bitval = x & bitmask; +/// } while(bitval == 0); +/// +/// loop-exit: +/// \endcode +static bool matchBitMaskIdxShift(Instruction *BitMask, ScalarEvolution *SE, + PHINode *CntPhi, IntegerType *VarXTy, + Intrinsic::ID &IntrinID, uint64_t &InitShift, + size_t &IdiomCanonicalSize) { + // First operand of BitMask is power-of-2 constant integer + // Second operand is either CntPhi or CntInst + ConstantInt *BitMaskConstInt = dyn_cast(BitMask->getOperand(0)); + if (!BitMaskConstInt || !BitMaskConstInt->getValue().isPowerOf2()) + return false; + + // If shifting right and BitMaskPhiInit equals SignBit, it must be LShr + if (BitMask->getOpcode() == Instruction::AShr && + BitMaskConstInt->getValue() == + APInt::getSignMask(BitMaskConstInt->getBitWidth())) + return false; + + const SCEVAddRecExpr *ShiftSCEV = + dyn_cast(SE->getSCEV(BitMask->getOperand(1))); + if (!ShiftSCEV || ShiftSCEV->getNumOperands() != 2) + return false; + + const SCEVConstant *Base = dyn_cast(ShiftSCEV->getOperand(0)); + const SCEVConstant *Stride = dyn_cast(ShiftSCEV->getOperand(1)); + if (!Base || !Stride || + (!Stride->isOne() && !Stride->getValue()->isMinusOne())) + return false; + + APInt Init = Base->getAPInt(); + if (BitMask->getOpcode() == Instruction::Shl) + Init = BitMaskConstInt->getValue().shl(Init); + else if (BitMask->getOpcode() == Instruction::LShr) + Init = BitMaskConstInt->getValue().lshr(Init); + else if (BitMask->getOpcode() == Instruction::AShr) + Init = BitMaskConstInt->getValue().ashr(Init); + + // Make sure the inital BitMask is valid + if (Init.countTrailingZeros() >= VarXTy->getBitWidth()) + return false; + + // Both stride and shift direction decide if cttz or ctlz + const bool PositiveStride = Stride->isOne(); + if (BitMask->getOpcode() == Instruction::Shl) + IntrinID = PositiveStride ? Intrinsic::cttz : Intrinsic::ctlz; + else + IntrinID = PositiveStride ? Intrinsic::ctlz : Intrinsic::cttz; + + // For ctlz, InitShift should be counted from left + InitShift = Init.countTrailingZeros(); + if (IntrinID == Intrinsic::ctlz) + InitShift = VarXTy->getBitWidth() - 1 - InitShift; + + // If IdxPhi is the same as CntPhi, IdiomCanonicalSize is 6. + PHINode *IdxPhi = dyn_cast(BitMask->getOperand(1)); + if (!IdxPhi) + IdxPhi = cast(cast(BitMask->getOperand(1))->getOperand(0)); + IdiomCanonicalSize = IdxPhi == CntPhi ? 6 : 8; + + return true; } // Check if the recurrence variable `VarX` is in the right form to create @@ -1187,7 +1378,7 @@ // step 1: Check if the loop-back branch is in desirable form. { - if (Value *T = matchCondition( + if (Value *T = matchConditionRetVar( dyn_cast(LoopEntry->getTerminator()), LoopEntry)) DefX2 = dyn_cast(T); else @@ -1266,7 +1457,7 @@ // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" { auto *PreCondBr = dyn_cast(PreCondBB->getTerminator()); - Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader()); + Value *T = matchConditionRetVar(PreCondBr, CurLoop->getLoopPreheader()); if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) return false; @@ -1278,6 +1469,175 @@ return true; } +/// The core idiom we are trying to detect is: +/// \code +/// if (x == 0) +/// goto loop-exit +/// +/// if (at least number of leading (ctlz) or trailing (cttz) +/// bits of x are zero) +/// goto loop-exit +/// +/// cnt0 = init-val; +/// do { +/// cnt = phi(cnt0, cnt.next); // CntPhi +/// +/// cnt.next = cnt + 1; // CntInst +/// ... +/// bitmask = ... ; // BitMask. Three kinds of scalar evolution are +/// // detected for it. See matchBitMaskIdxShift and +/// // matchBitMaskShift. +/// ... +/// bitval = x & bitmask; +/// } while(bitval == 0); +/// +/// loop-exit: +/// \endcode +/// TODO: detect multi-block version of bitscan where, instead of doing a zero +/// check before loop, doing a limit check for each loop iteration +static bool detectBitScanIdiom(Loop *CurLoop, DominatorTree *DT, + ScalarEvolution *SE, Intrinsic::ID &IntrinID, + uint64_t &InitShift, Value *&VarX, + Instruction *&CntInst, PHINode *&CntPhi, + size_t &IdiomCanonicalSize) { + BasicBlock *LoopPreheader = CurLoop->getLoopPreheader(); + BasicBlock *LoopEntry = *(CurLoop->block_begin()); + Instruction *BitVal = nullptr; + Instruction *BitMask = nullptr; + CntPhi = nullptr; + CntInst = nullptr; + + // Find CntPhi as the phi SCEV that is {base,+,1}, base must be constant + for (auto I = LoopEntry->begin(), + E = LoopEntry->getFirstNonPHI()->getIterator(); I != E; ++I) { + PHINode *P = cast(&*I); + if (!SE->isSCEVable(P->getType())) + continue; + const SCEVAddRecExpr *Count = dyn_cast(SE->getSCEV(P)); + if (!Count || Count->getNumOperands() != 2) + continue; + + const SCEVConstant *Base = dyn_cast(Count->getOperand(0)); + const SCEVConstant *Stride = dyn_cast(Count->getOperand(1)); + if (Base && Stride && Stride->isOne()) { + CntPhi = P; + break; + } + } + + if (!CntPhi) + return false; + + // Get CntInst + CntInst = cast(CntPhi->getIncomingValueForBlock(LoopEntry)); + assert (CntInst->getOpcode() == Instruction::Add); + + // Check if the loop-back branch is in desirable form. + if (Value *T = matchConditionRetVar( + dyn_cast(LoopEntry->getTerminator()), LoopEntry, true)) + BitVal = dyn_cast(T); + else + return false; + + // BitVal must be bitwise and instruction + if (!BitVal || BitVal->getOpcode() != Instruction::And) + return false; + + // One operand of BitVal is BitMask, the other is VarX + VarX = BitVal->getOperand(0); + BitMask = dyn_cast(BitVal->getOperand(1)); + if (!CurLoop->isLoopInvariant(VarX)) { + VarX = BitVal->getOperand(1); + BitMask = dyn_cast(BitVal->getOperand(0)); + } + if (!CurLoop->isLoopInvariant(VarX) || !BitMask || + BitMask->getParent() != LoopEntry || !BitMask->isShift()) + return false; + + // VarX has integer type + IntegerType *VarXTy = dyn_cast(VarX->getType()); + if (!VarXTy) + return false; + + // Scalar evolution of BitMask decides if the idiom is ctlz or cttz + if (matchBitMaskShift(BitMask, LoopPreheader, VarXTy, IntrinID, InitShift)) + IdiomCanonicalSize = 7; + else if (!matchBitMaskIdxShift(BitMask, SE, CntPhi, VarXTy, + IntrinID, InitShift, IdiomCanonicalSize)) + return false; + + // Decide from CFG that if VarX is not zero and number of leading + // (ctlz) or trailing(cttz) bits are zero. + + // Find block to prove x != 0 + bool CheckVarXIsNotZero = false; + // Find block to prove x has num of leading / trailing bits clear + bool CheckVarXFirstBitsAreZero = InitShift == 0; + + for (Use &U : VarX->uses()) { + auto *UI = dyn_cast(U.getUser()); + if (!UI) + continue; + + BasicBlock *CurBB = UI->getParent(); + BranchInst *BI = dyn_cast(CurBB->getTerminator()); + if (!BI) + continue; + + if (auto *Cmp = dyn_cast(UI)) { + // CheckVarXIsNotZero + if (BasicBlock *BB = matchConditionRetNonZeroBlock(BI)) { + BasicBlockEdge BBE(CurBB, BB); + if (DT->dominates(BBE, LoopEntry)) + CheckVarXIsNotZero = true; + } + + // CheckVarXFirstBitsAreZero + // Detect %cmp = icmp sgt i32 %x, -1 as alternative to + // %and = and i32 %x, + // %cmp = icmp eq i32 %and, 0 + if (!CheckVarXFirstBitsAreZero && + IntrinID == Intrinsic::ctlz && + Cmp->getPredicate() == CmpInst::ICMP_SGT) + if (ConstantInt *CI = dyn_cast(UI->getOperand(1))) + if (CI->isMinusOne()) + CheckVarXFirstBitsAreZero = true; + } else if (!CheckVarXFirstBitsAreZero && + UI->getOpcode() == Instruction::And) { + // CheckVarXFirstBitsAreZero + // Check for pattern: + // %and = and i32 %x, + // %cmp = icmp eq i32 %and, 0 + + ConstantInt *Mask = dyn_cast(UI->getOperand(1)); + if (!Mask) + continue; + + // Check the constant value of Mask such that AT LEAST leading / trailing + // InitShift number of bits are set, and the rest of bits are clear. + + // TheMask = Mask + 1 + APInt TheMask = Mask->getValue(); + if (IntrinID == Intrinsic::ctlz) // Check high bits of Mask if ctlz + TheMask = TheMask.reverseBits(); + TheMask = TheMask + 1; + + if (TheMask.isPowerOf2() && TheMask.countTrailingZeros() >= InitShift) + if (BasicBlock *BB = matchConditionRetNonZeroBlock(BI, true)) { + BasicBlockEdge BBE(CurBB, BB); + if (DT->dominates(BBE, LoopEntry) && + cast(BI->getCondition())->getOperand(0) == UI) + CheckVarXFirstBitsAreZero = true; + } + } + } + + if (!CheckVarXIsNotZero || !CheckVarXFirstBitsAreZero) + return false; + + return true; +} + /// Return true if the idiom is detected in the loop. /// /// Additionally: @@ -1305,39 +1665,47 @@ /// /// loop-exit: /// \endcode -static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, - Instruction *&CntInst, PHINode *&CntPhi, - Instruction *&DefX) { +static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, + Intrinsic::ID &IntrinID, Value *&InitX, + Instruction *&CntInst, PHINode *&CntPhi, + Instruction *&DefX) { BasicBlock *LoopEntry; Value *VarX = nullptr; DefX = nullptr; - PhiX = nullptr; CntInst = nullptr; CntPhi = nullptr; LoopEntry = *(CurLoop->block_begin()); // step 1: Check if the loop-back branch is in desirable form. - if (Value *T = matchCondition( + if (Value *T = matchConditionRetVar( dyn_cast(LoopEntry->getTerminator()), LoopEntry)) DefX = dyn_cast(T); else return false; - // step 2: detect instructions corresponding to "x.next = x >> 1" - if (!DefX || (DefX->getOpcode() != Instruction::AShr && - DefX->getOpcode() != Instruction::LShr)) + // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1" + if (!DefX || !DefX->isShift()) return false; + IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz : + Intrinsic::ctlz; ConstantInt *Shft = dyn_cast(DefX->getOperand(1)); if (!Shft || !Shft->isOne()) return false; VarX = DefX->getOperand(0); // step 3: Check the recurrence of variable X - PhiX = getRecurrenceVar(VarX, DefX, LoopEntry); + PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry); if (!PhiX) return false; + InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader()); + + // Make sure the initial value can't be negative otherwise the ashr in the + // loop might never reach zero which would make the loop infinite. + if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL)) + return false; + // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1 // TODO: We can skip the step. If loop trip count is known (CTLZ), // then all uses of "cnt.next" could be optimized to the trip count @@ -1369,18 +1737,37 @@ return true; } -/// Recognize CTLZ idiom in a non-countable loop and convert the loop -/// to countable (with CTLZ trip count). -/// If CTLZ inserted as a new trip count returns true; otherwise, returns false. -bool LoopIdiomRecognize::recognizeAndInsertCTLZ() { +/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop +/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new +/// trip count returns true; otherwise, returns false. +/// For either, two kinds of idioms are detected: ShiftUntilZero or BitScan. +bool LoopIdiomRecognize::recognizeAndInsertFFS() { // Give up if the loop has multiple blocks or multiple backedges. if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) return false; - Instruction *CntInst, *DefX; - PHINode *CntPhi, *PhiX; - if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX)) - return false; + Intrinsic::ID IntrinID; + Value *InitX; + Instruction *DefX = nullptr; + PHINode *CntPhi = nullptr; + Instruction *CntInst = nullptr; + // Help decide if transformation is profitable. For ShiftUntilZero idiom, + // this is always 6. For BitScan idiom, this could be 6,7 or 8. + size_t IdiomCanonicalSize = 6; + // For BitScan idiom only + uint64_t InitShift; + + bool FoundBitScanIdiom = false; + if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, + CntInst, CntPhi, DefX)) { + if (!detectBitScanIdiom(CurLoop, DT, SE, IntrinID, InitShift, InitX, + CntInst, CntPhi, IdiomCanonicalSize)) + return false; + else + FoundBitScanIdiom = true; + } + + // FoundBitScanIdiom is false if detect ShiftUntilZero idiom bool IsCntPhiUsedOutsideLoop = false; for (User *U : CntPhi->users()) @@ -1402,35 +1789,32 @@ // For some CPUs result of CTLZ(X) intrinsic is undefined // when X is 0. If we can not guarantee X != 0, we need to check this // when expand. - bool ZeroCheck = false; + bool ZeroCheck = FoundBitScanIdiom; + // It is safe to assume Preheader exist as it was checked in // parent function RunOnLoop. BasicBlock *PH = CurLoop->getLoopPreheader(); - Value *InitX = PhiX->getIncomingValueForBlock(PH); - - // Make sure the initial value can't be negative otherwise the ashr in the - // loop might never reach zero which would make the loop infinite. - if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, *DL)) - return false; // If we are using the count instruction outside the loop, make sure we // have a zero check as a precondition. Without the check the loop would run // one iteration for before any check of the input value. This means 0 and 1 // would have identical behavior in the original loop and thus - if (!IsCntPhiUsedOutsideLoop) { + if (!FoundBitScanIdiom && !IsCntPhiUsedOutsideLoop) { auto *PreCondBB = PH->getSinglePredecessor(); if (!PreCondBB) return false; auto *PreCondBI = dyn_cast(PreCondBB->getTerminator()); if (!PreCondBI) return false; - if (matchCondition(PreCondBI, PH) != InitX) + if (matchConditionRetVar(PreCondBI, PH) != InitX) return false; ZeroCheck = true; } - // Check if CTLZ intrinsic is profitable. Assume it is always profitable - // if we delete the loop (the loop has only 6 instructions): + // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always + // profitable if we delete the loop. + + // For ShiftUntilZero idiom, the loop has only 6 instructions: // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ] // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ] // %shr = ashr %n.addr.0, 1 @@ -1441,14 +1825,14 @@ const Value *Args[] = {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext()) : ConstantInt::getFalse(InitX->getContext())}; - if (CurLoop->getHeader()->size() != 6 && - TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) > - TargetTransformInfo::TCC_Basic) + if (CurLoop->getHeader()->size() != IdiomCanonicalSize && + TTI->getIntrinsicCost(IntrinID, InitX->getType(), Args) > + TargetTransformInfo::TCC_Basic) return false; - transformLoopToCountable(PH, CntInst, CntPhi, InitX, DefX, - DefX->getDebugLoc(), ZeroCheck, - IsCntPhiUsedOutsideLoop); + const DebugLoc &XDL = DefX ? DefX->getDebugLoc() : CntInst->getDebugLoc(); + transformLoopToCountable(IntrinID, PH, InitShift, CntInst, CntPhi, InitX, + DefX, XDL, ZeroCheck, IsCntPhiUsedOutsideLoop); return true; } @@ -1515,20 +1899,21 @@ return CI; } -static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val, - const DebugLoc &DL, bool ZeroCheck) { +static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, + const DebugLoc &DL, bool ZeroCheck, + Intrinsic::ID IID) { Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()}; Type *Tys[] = {Val->getType()}; Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); - Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys); + Value *Func = Intrinsic::getDeclaration(M, IID, Tys); CallInst *CI = IRBuilder.CreateCall(Func, Ops); CI->setDebugLoc(DL); return CI; } -/// Transform the following loop: +/// Transform the following loop (Using CTLZ, CTTZ is similar): /// loop: /// CntPhi = PHI [Cnt0, CntInst] /// PhiX = PHI [InitX, DefX] @@ -1539,11 +1924,19 @@ /// Use(CntPhi) or Use(CntInst) /// /// Into: -/// If CntPhi used outside the loop: -/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1) -/// Count = CountPrev + 1 -/// else -/// Count = BitWidth(InitX) - CTLZ(InitX) +/// (ShiftUntilZero): +/// If CntPhi used outside the loop: +/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1) +/// Count = CountPrev + 1 +/// else +/// Count = BitWidth(InitX) - CTLZ(InitX) +/// +/// (BitScan): +/// If CntPhi used outside the loop: +/// CountPrev = CTLZ(InitX) - InitShift +/// Count = CountPrev + 1 +/// else +/// Count = CTLZ(InitX) - InitShift + 1 /// loop: /// CntPhi = PHI [Cnt0, CntInst] /// PhiX = PHI [InitX, DefX] @@ -1560,49 +1953,61 @@ /// If LOOP_BODY is empty the loop will be deleted. /// If CntInst and DefX are not used in LOOP_BODY they will be removed. void LoopIdiomRecognize::transformLoopToCountable( - BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, - Instruction *DefX, const DebugLoc &DL, bool ZeroCheck, - bool IsCntPhiUsedOutsideLoop) { + Intrinsic::ID IntrinID, BasicBlock *Preheader, uint64_t InitShift, + Instruction *CntInst, PHINode *CntPhi, Value *InitX, Instruction *DefX, + const DebugLoc &DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { + const bool FoundBitScanIdiom = !DefX; BranchInst *PreheaderBr = cast(Preheader->getTerminator()); - // Step 1: Insert the CTLZ instruction at the end of the preheader block - // Count = BitWidth - CTLZ(InitX); - // If there are uses of CntPhi create: - // CountPrev = BitWidth - CTLZ(InitX >> 1); + // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block IRBuilder<> Builder(PreheaderBr); Builder.SetCurrentDebugLocation(DL); - Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext; - - if (IsCntPhiUsedOutsideLoop) { - if (DefX->getOpcode() == Instruction::AShr) - InitXNext = - Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1)); - else if (DefX->getOpcode() == Instruction::LShr) - InitXNext = - Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1)); - else - llvm_unreachable("Unexpected opcode!"); - } else - InitXNext = InitX; - CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck); - Count = Builder.CreateSub( - ConstantInt::get(CTLZ->getType(), - CTLZ->getType()->getIntegerBitWidth()), - CTLZ); - if (IsCntPhiUsedOutsideLoop) { - CountPrev = Count; - Count = Builder.CreateAdd( - CountPrev, - ConstantInt::get(CountPrev->getType(), 1)); + Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext; + + if (FoundBitScanIdiom) { + FFS = createFFSIntrinsic(Builder, InitX, DL, ZeroCheck, IntrinID); + Count = Builder.CreateSub(FFS, + ConstantInt::get(FFS->getType(), InitShift)); + if (IsCntPhiUsedOutsideLoop) + CountPrev = Count; + Count = Builder.CreateAdd(Count, + ConstantInt::get(Count->getType(), 1)); + } else { + // Count = BitWidth - CTLZ(InitX); + // If there are uses of CntPhi create: + // CountPrev = BitWidth - CTLZ(InitX >> 1); + if (IsCntPhiUsedOutsideLoop) { + if (DefX->getOpcode() == Instruction::AShr) + InitXNext = + Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1)); + else if (DefX->getOpcode() == Instruction::LShr) + InitXNext = + Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1)); + else if (DefX->getOpcode() == Instruction::Shl) // cttz + InitXNext = + Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1)); + else + llvm_unreachable("Unexpected opcode!"); + } else + InitXNext = InitX; + FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID); + Count = Builder.CreateSub( + ConstantInt::get(FFS->getType(), + FFS->getType()->getIntegerBitWidth()), + FFS); + if (IsCntPhiUsedOutsideLoop) { + CountPrev = Count; + Count = Builder.CreateAdd( + CountPrev, + ConstantInt::get(CountPrev->getType(), 1)); + } } - if (IsCntPhiUsedOutsideLoop) - NewCount = Builder.CreateZExtOrTrunc(CountPrev, - cast(CntInst->getType())); - else - NewCount = Builder.CreateZExtOrTrunc(Count, - cast(CntInst->getType())); - // If the CTLZ counter's initial value is not zero, insert Add Inst. + NewCount = Builder.CreateZExtOrTrunc( + IsCntPhiUsedOutsideLoop ? CountPrev : Count, + cast(CntInst->getType())); + + // If the counter's initial value is not zero, insert Add Inst. Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader); ConstantInt *InitConst = dyn_cast(CntInitVal); if (!InitConst || !InitConst->isZero()) @@ -1638,8 +2043,7 @@ LbCond->setOperand(1, ConstantInt::get(Ty, 0)); // Step 3: All the references to the original counter outside - // the loop are replaced with the NewCount -- the value returned from - // __builtin_ctlz(x). + // the loop are replaced with the NewCount if (IsCntPhiUsedOutsideLoop) CntPhi->replaceUsesOutsideBlock(NewCount, Body); else Index: test/Transforms/LoopIdiom/X86/ctlz.ll =================================================================== --- test/Transforms/LoopIdiom/X86/ctlz.ll +++ test/Transforms/LoopIdiom/X86/ctlz.ll @@ -1,6 +1,361 @@ ; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=core-avx2 < %s -S | FileCheck -check-prefix=LZCNT --check-prefix=ALL %s ; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=corei7 < %s -S | FileCheck -check-prefix=NOLZCNT --check-prefix=ALL %s +; int ctlz_bitscan_shl(int x) { +; int count = 0; +; if (!x) return sizeof(x)*8; +; unsigned shift = sizeof(x)*8; +; while ((x & (1<< --shift)) == 0) { +; count++; +; } +; return count; +; } +; +; ALL-LABEL: @ctlz_bitscan_shl +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 0 +; ALL-NEXT: %2 = add i32 %1, 1 +; ALL-NEXT: %3 = add i32 %1, 10 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_shl(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + br label %while.cond + +while.cond: ; preds = %while.cond.preheader, %while.cond + %count.0 = phi i32 [ %inc, %while.cond ], [ 10, %while.cond.preheader ] + %shift.0 = phi i32 [ %dec, %while.cond ], [ 32, %while.cond.preheader ] + %dec = add i32 %shift.0, -1 + %shl = shl i32 1, %dec + %and = and i32 %shl, %x + %cmp = icmp eq i32 %and, 0 + %inc = add nuw nsw i32 %count.0, 1 + br i1 %cmp, label %while.cond, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.cond + %count.0.lcssa = phi i32 [ %count.0, %while.cond ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %entry + %retval.0 = phi i32 [ 32, %entry ], [ %count.0.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; int ctlz_bitscan_lshr(int x) { +; int count = 0; +; if (!x) return sizeof(x)*8; +; unsigned m = 0x80000000; +; int i = 0; +; while ((x & (m >> i)) == 0) { +; ++i; +; count++; +; } +; return count; +; } +; +; ALL-LABEL: @ctlz_bitscan_lshr +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 1 +; ALL-NEXT: %2 = add i32 %1, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_lshr(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %cmp9 = icmp sgt i32 %x, -1 + br i1 %cmp9, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.011 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %i.011, 1 + %shr = lshr i32 1073741824, %i.011 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc1.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc1.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; ALL-LABEL: @ctlz_bitscan_ashr +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 1 +; ALL-NEXT: %2 = add i32 %1, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_ashr(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %cmp9 = icmp sgt i32 %x, -1 + br i1 %cmp9, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.011 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %i.011, 1 + %shr = ashr i32 1073741824, %i.011 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc1.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc1.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; For AShr, left operand can not be sign mask. +; +; ALL-LABEL: @ctlz_bitscan_ashr_bad_mask +; ALL-NOT: @llvm.ctlz +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_ashr_bad_mask(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %cmp9 = icmp sgt i32 %x, -1 + br i1 %cmp9, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.011 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %i.011, 1 + %shr = ashr i32 -2147483648, %i.011 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc1.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc1.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; Mask iterate on itself. +; int ctlz_bitscan_lshr_phi(int x) { +; int count = 0; +; if (!x) return sizeof(x)*8; +; unsigned mask = 0x1 << (sizeof(x)*8-1); +; while ((x & mask) == 0) { +; count++; +; mask >>= 1; +; } +; return count; +; } +; +; ALL-LABEL: @ctlz_bitscan_lshr_phi +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 1 +; ALL-NEXT: %2 = add i32 %1, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_lshr_phi(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %cmp6 = icmp sgt i32 %x, -1 + br i1 %cmp6, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %mask.08 = phi i32 [ %shr, %while.body ], [ -2147483648, %while.body.preheader ] + %count.07 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %count.07, 1 + %shr = lshr i32 %mask.08, 1 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; ALL-LABEL: @ctlz_bitscan_lshr_phi_bad_mask +; ALL-NOT: @llvm.ctlz +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_lshr_phi_bad_mask(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %cmp6 = icmp sgt i32 %x, -1 + br i1 %cmp6, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %mask.08 = phi i32 [ %shr, %while.body ], [ -2147483648, %while.body.preheader ] + %count.07 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %count.07, 1 + %shr = ashr i32 %mask.08, 1 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; Mask iterate on itself, starting from midpoint of %x +; +; ALL-LABEL: @ctlz_bitscan_mid +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 17 +; ALL-NEXT: %2 = add i32 %1, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_mid(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %and1 = and i32 %x, 4294950912 + %cmp6 = icmp eq i32 %and1, 0 + br i1 %cmp6, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %mask.08 = phi i32 [ %shr, %while.body ], [ 32768, %while.body.preheader ] + %count.07 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %count.07, 1 + %shr = lshr i32 %mask.08, 1 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; Mask iterate on itself, starting from midpoint of %x. However, block +; %while.cond.preheader only prove high 12 bits are zero, instead of 16 bits +; required by 65536. No ctlz pattern here. +; +; ALL-LABEL: @ctlz_bitscan_mid_first_bits_check_fail +; ALL-NOT: @llvm.ctlz +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_mid_first_bits_check_fail(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %and1 = and i32 %x, 4293918720 + %cmp6 = icmp eq i32 %and1, 0 + br i1 %cmp6, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %mask.08 = phi i32 [ %shr, %while.body ], [ 65536, %while.body.preheader ] + %count.07 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %count.07, 1 + %shr = lshr i32 %mask.08, 1 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; %x could be zero. No ctlz pattern here. +; +; ALL-LABEL: @ctlz_bitscan_lshr_zero_check_fail +; ALL-NOT: @llvm.ctlz +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @ctlz_bitscan_lshr_zero_check_fail(i32 %x) { +entry: + br label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %cmp9 = icmp sgt i32 %x, -1 + br i1 %cmp9, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.011 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %i.011, 1 + %shr = lshr i32 1073741824, %i.011 + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc1.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 0, %while.cond.preheader ], [ %inc1.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + ; Recognize CTLZ builtin pattern. ; Here we'll just convert loop to countable, ; so do not insert builtin if CPU do not support CTLZ @@ -16,7 +371,7 @@ ; return i; ; } ; -; LZCNT: entry +; LZCNT-LABEL: @ctlz_and_other ; LZCNT: %0 = call i32 @llvm.ctlz.i32(i32 %shr8, i1 true) ; LZCNT-NEXT: %1 = sub i32 32, %0 ; LZCNT-NEXT: %2 = zext i32 %1 to i64 @@ -25,8 +380,8 @@ ; LZCNT: %i.0.lcssa = phi i32 [ 0, %entry ], [ %4, %while.end.loopexit ] ; LZCNT: ret i32 %i.0.lcssa -; NOLZCNT: entry -; NOLZCNT-NOT: @llvm.ctlz +; NOLZCNT-LABEL: @ctlz_and_other +; NOLZCNT-NOT: @llvm.ctlz ; Function Attrs: norecurse nounwind uwtable define i32 @ctlz_and_other(i32 %n, i8* nocapture %a) { @@ -71,7 +426,7 @@ ; ; int ctlz_zero_check(int n) ; { -; n = n >= 0 ? n : -n; +; n = n > 0 ? n : -n; ; int i = 0; ; while(n) { ; n >>= 1; @@ -80,7 +435,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_zero_check ; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %abs_n, i1 true) ; ALL-NEXT: %1 = sub i32 32, %0 ; ALL: %inc.lcssa = phi i32 [ %1, %while.body ] @@ -129,7 +484,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_zero_check_lshr ; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %n, i1 true) ; ALL-NEXT: %1 = sub i32 32, %0 ; ALL: %inc.lcssa = phi i32 [ %1, %while.body ] @@ -175,7 +530,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz ; ALL: %0 = ashr i32 %abs_n, 1 ; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) ; ALL-NEXT: %2 = sub i32 32, %1 @@ -216,7 +571,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_lshr ; ALL: %0 = lshr i32 %n, 1 ; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) ; ALL-NEXT: %2 = sub i32 32, %1 @@ -255,7 +610,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_add ; ALL: %0 = ashr i32 %abs_n, 1 ; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) ; ALL-NEXT: %2 = sub i32 32, %1 @@ -297,7 +652,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_add_lshr ; ALL: %0 = lshr i32 %n, 1 ; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) ; ALL-NEXT: %2 = sub i32 32, %1 @@ -339,7 +694,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_sext ; ALL: %0 = ashr i32 %abs_n, 1 ; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) ; ALL-NEXT: %2 = sub i32 32, %1 @@ -381,7 +736,7 @@ ; return i; ; } ; -; ALL: entry +; ALL-LABEL: @ctlz_sext_lshr ; ALL: %0 = lshr i32 %n, 1 ; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) ; ALL-NEXT: %2 = sub i32 32, %1 Index: test/Transforms/LoopIdiom/X86/cttz.ll =================================================================== --- /dev/null +++ test/Transforms/LoopIdiom/X86/cttz.ll @@ -0,0 +1,227 @@ +; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=core-avx2 < %s -S | FileCheck --check-prefix=ALL %s +; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=corei7 < %s -S | FileCheck --check-prefix=ALL %s + +; int cttz_bitscan_shl(int x) { +; int count = 0; +; if (!x) return sizeof(x)*8; +; int i = 0; +; while (((x >> i) & 0x1) == 0) { +; i++; +; count++; +; } +; return count; +; } +; +; ALL-LABEL: @cttz_bitscan_shl +; ALL: %1 = call i32 @llvm.cttz.i32(i32 %x, i1 true) +; ALL-NEXT: %2 = sub i32 %1, 1 +; ALL-NEXT: %3 = add i32 %2, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @cttz_bitscan_shl(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %0 = and i32 %x, 1 + %cmp8 = icmp eq i32 %0, 0 + br i1 %cmp8, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.010 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %inc = add nuw nsw i32 %i.010, 1 + %1 = shl i32 2, %i.010 + %2 = and i32 %1, %x + %cmp = icmp eq i32 %2, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc1.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc1.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; int cttz_bitscan_shl_phi(int x) { +; int count = 0; +; if (!x) return sizeof(x)*8; +; unsigned i = 1; +; while ((x & i) == 0) { +; i <<= 1; +; count++; +; } +; return count; +; } +; +; ALL-LABEL: @cttz_bitscan_shl_phi +; ALL: %0 = call i32 @llvm.cttz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 1 +; ALL-NEXT: %2 = add i32 %1, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @cttz_bitscan_shl_phi(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %and6 = and i32 %x, 1 + %cmp7 = icmp eq i32 %and6, 0 + br i1 %cmp7, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.09 = phi i32 [ %shl, %while.body ], [ 1, %while.body.preheader ] + %count.08 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %shl = shl i32 %i.09, 1 + %inc = add nuw nsw i32 %count.08, 1 + %and = and i32 %shl, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; int cttz_bitscan_lshr(int x) { +; int count = 0; +; if (!x) return sizeof(x)*8; +; unsigned m = 0x80000000; +; int i = 31; +; while (((m >> i) & x) == 0) { +; --i; +; count++; +; } +; return count; +; } +; +; ALL-LABEL: @cttz_bitscan_lshr +; ALL: %0 = call i32 @llvm.cttz.i32(i32 %x, i1 true) +; ALL-NEXT: %1 = sub i32 %0, 1 +; ALL-NEXT: %2 = add i32 %1, 1 +; +; Function Attrs: norecurse nounwind readnone ssp uwtable +define i32 @cttz_bitscan_lshr(i32 %x) { +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cleanup, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %and7 = and i32 %x, 1 + %cmp8 = icmp eq i32 %and7, 0 + br i1 %cmp8, label %while.body.preheader, label %cleanup + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.010 = phi i32 [ %dec, %while.body ], [ 31, %while.body.preheader ] + %count.09 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %dec = add nsw i32 %i.010, -1 + %inc = add nuw nsw i32 %count.09, 1 + %shr = lshr i32 -2147483648, %dec + %and = and i32 %shr, %x + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %while.body, label %cleanup.loopexit + +cleanup.loopexit: ; preds = %while.body + %inc.lcssa = phi i32 [ %inc, %while.body ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %while.cond.preheader, %entry + %retval.0 = phi i32 [ 32, %entry ], [ 0, %while.cond.preheader ], [ %inc.lcssa, %cleanup.loopexit ] + ret i32 %retval.0 +} + +; Recognize CTTZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int cttz_zero_check(int n) +; { +; int i = 0; +; while(n) { +; n <<= 1; +; i++; +; } +; return i; +; } +; +; ALL-LABEL: @cttz_zero_check +; ALL: %0 = call i32 @llvm.cttz.i32(i32 %n, i1 true) +; ALL-NEXT: %1 = sub i32 32, %0 +; +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @cttz_zero_check(i32 %n) { +entry: + %tobool4 = icmp eq i32 %n, 0 + br i1 %tobool4, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %n.addr.05 = phi i32 [ %shl, %while.body ], [ %n, %while.body.preheader ] + %shl = shl i32 %n.addr.05, 1 + %inc = add nsw i32 %i.06, 1 + %tobool = icmp eq i32 %shl, 0 + br i1 %tobool, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ] + ret i32 %i.0.lcssa +} + +; Recognize CTTZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int cttz(int n) +; { +; int i = 0; +; while(n <<= 1) { +; i++; +; } +; return i; +; } +; +; ALL-LABEL: @cttz +; ALL: %0 = shl i32 %n, 1 +; ALL-NEXT: %1 = call i32 @llvm.cttz.i32(i32 %0, i1 false) +; ALL-NEXT: %2 = sub i32 32, %1 +; ALL-NEXT: %3 = add i32 %2, 1 +; +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @cttz(i32 %n) { +entry: + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %shl, %while.cond ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ] + %shl = shl i32 %n.addr.0, 1 + %tobool = icmp eq i32 %shl, 0 + %inc = add nsw i32 %i.0, 1 + br i1 %tobool, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret i32 %i.0 +} +