Index: llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -243,6 +243,8 @@ void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var); bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz + bool recognizeAndReplaceCTZ(); + void transformLoopToCTZ(Loop *CurLoop, PHINode *CntPhi, Value *Val); void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var, Instruction *DefX, @@ -1573,7 +1575,8 @@ << CurLoop->getHeader()->getName() << "\n"); return recognizePopcount() || recognizeAndInsertFFS() || - recognizeShiftUntilBitTest() || recognizeShiftUntilZero(); + recognizeShiftUntilBitTest() || recognizeShiftUntilZero() || + recognizeAndReplaceCTZ(); } /// Check if the given conditional branch is based on the comparison between @@ -1847,6 +1850,181 @@ return true; } +/// Return true if the idiom is detected in the loop. +/// +/// Additionally: +/// 1) \p CntPhi is set to the corresponding phi node. +/// 2) \p Val is set to the value whose trailing zeros are being counted. +/// +/// The core idiom we are trying to detect is: +/// \code +/// x = init-val; +/// land.rhs: +/// i = phi (0, i.next); +/// count = phi(0, count.next); +/// shl = 1 << i; +/// and = shl & x; // Val +/// if (and != 0) +/// goto while.exit; +/// else +/// goto while.body; +/// while.body: +/// i.next = i + 1; +/// count.next = count + 1; +/// if (i.next < 32) +/// goto land.rhs; +/// while.exit: +/// count.exit = phi(count.next, count); // CntPhi +/// \endcode +static bool detectCTZIdiom(Loop *CurLoop, BasicBlock *LoopHeader, + BasicBlock *LoopExit, PHINode *&CntPhi, + Value *&Val) { + BasicBlock *LoopLatch = CurLoop->getLoopLatch(); + + // Step 1: Get the phi after the loop and check everything about it. + CntPhi = dyn_cast(&LoopExit->front()); + + if (!CntPhi || CntPhi->getNumIncomingValues() != 2) + return false; + + auto *AddIncValue0 = dyn_cast(CntPhi->getIncomingValue(0)); + + // Check that the first incoming value is an add which is inside LoopBody. + if (!AddIncValue0 || AddIncValue0->getOpcode() != Instruction::Add || + AddIncValue0->getParent() != LoopLatch) + return false; + + auto *Int = dyn_cast(AddIncValue0->getOperand(1)); + + // Check that this add is a ++. + if (!Int || !Int->isOne()) + return false; + + auto *PhiIntValue1 = dyn_cast(CntPhi->getIncomingValue(1)); + + // Check that the incoming values comes from LoopHeader and LoopBody. + // Check that the second incoming value comes from LoopBody. + if (!PhiIntValue1 || PhiIntValue1->getParent() != LoopHeader) + return false; + + // The phi must have a 0 value as first incoming value. + Int = dyn_cast(PhiIntValue1->getIncomingValue(0)); + if (!Int || !Int->isZero()) + return false; + + // Step 2: Check that the first non phi instruction is a left shift. + Instruction *Shl = LoopHeader->getFirstNonPHI(); + + if (!Shl || Shl->getOpcode() != Instruction::Shl) + return false; + + // The shift is done on 1 with i. + Int = dyn_cast(Shl->getOperand(0)); + if (!Int || !Int->isOne()) + return false; + + // Step 3: + // Check that the instruction after the shift is an and which uses the result + // of the previous shift as one of its operands. + // Its other operand is the x which should be used as CTZ operand. + Instruction *And = Shl->getNextNode(); + if (!And || And->getOpcode() != Instruction::And) + return false; + + // And first operand is the result of the previous Shl. + if (And->getOperand(0) != Shl) + return false; + + // And second operand is Val. + Val = And->getOperand(1); + + return true; +} + +/// Recognizes a count trailing zeros idiom in a non-countable loop. +/// +/// If detected, transforms the relevant code to issue the cttz intrinsic +/// function call, and returns true; otherwise, returns false. +bool LoopIdiomRecognize::recognizeAndReplaceCTZ() { + // The loop should have two blocks: + // 1. A header. + // 2. A body where the counter is incremented. + // Give up if the loop has not 2 blocks or multiple backedges. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2) + return false; + + // It should have a preheader containing nothing but an unconditional branch + // to the loop header. + BasicBlock *PH = CurLoop->getLoopPreheader(); + if (!PH || &PH->front() != PH->getTerminator()) + return false; + auto *EntryBI = dyn_cast(PH->getTerminator()); + if (!EntryBI || EntryBI->isConditional()) + return false; + + // The header counts minimum 5 instructions: + // land.rhs: + // %i.07 = phi i32 [ 0, %entry ], [ %inc, %while.body ] + // %0 = shl nuw i32 1, %i.07 + // %1 = and i32 %0, %x + // %cmp1 = icmp eq i32 %1, 0 + // br i1 %cmp1, label %while.body, label %while.end + const ssize_t IdiomCanonicalSize = 5; + BasicBlock *LoopHeader = CurLoop->getHeader(); + + if (LoopHeader->sizeWithoutDebug() < IdiomCanonicalSize) + return false; + + // The loop should have only one exit block. + BasicBlock *LoopExit = CurLoop->getUniqueExitBlock(); + + if (!LoopExit) + return false; + + PHINode *CntPhi; + Value *Val; + if (!detectCTZIdiom(CurLoop, LoopHeader, LoopExit, CntPhi, Val)) + return false; + + transformLoopToCTZ(CurLoop, CntPhi, Val); + return true; +} + +/// Create a cttz intrinsic instruction ready to be used with is_zero_undef set +/// to true. +static CallInst *createCtzIntrinsic(IRBuilder<> &IRBuilder, Value *Val, + const DebugLoc &DL) { + // The cttz intrinsic takes 2 arguments: + // 1. src to count the trailing zeros from. In this case src is Val. + // 2. is_zero_undef, a boolean, if set cttz returns src type size in bits when + // src is 0 or undef otherwise. For this call, we hardcode true. + Value *Ops[] = {Val, IRBuilder.getTrue()}; + + Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); + Function *Func = + Intrinsic::getDeclaration(M, Intrinsic::cttz, Val->getType()); + CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CI->setDebugLoc(DL); + + return CI; +} + +/// Replace loop with a cttz intrinsic. +void LoopIdiomRecognize::transformLoopToCTZ(Loop *CurLoop, PHINode *CntPhi, + Value *Val) { + // Step 1: Create the cttz intrinsic. + IRBuilder<> Builder(CntPhi); + const DebugLoc &DL = CntPhi->getDebugLoc(); + Instruction *Ctz = createCtzIntrinsic(Builder, Val, DL); + + // Step 2: Replace the phi by Ctz everywhere it appears. + CntPhi->replaceAllUsesWith(Ctz); + CntPhi->eraseFromParent(); + + // Step 3: Forget the loop. + SE->forgetLoop(CurLoop); +} + /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop /// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new /// trip count returns true; otherwise, returns false. Index: llvm/test/Transforms/LoopIdiom/cttz.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopIdiom/cttz.ll @@ -0,0 +1,66 @@ +; RUN: opt -loop-idiom < %s -S | FileCheck %s + +; To recognize this pattern: +; int cttz(unsigned int x) { +; unsigned int i = 0; +; while(i < 32 && (((x >> i) & 0x1) == 0)) +; i++; +; return i; +; } +; +; CHECK: entry +; CHECK: llvm.cttz.i32 +; CHECK: ret +define i32 @cttz32(i32 %x) norecurse nounwind readnone uwtable { +entry: + br label %land.rhs + +land.rhs: ; preds = %entry, %while.body + %i.06 = phi i32 [ 0, %entry ], [ %inc, %while.body ] + %0 = shl nuw i32 1, %i.06 + %1 = and i32 %0, %x + %cmp1 = icmp eq i32 %1, 0 + br i1 %cmp1, label %while.body, label %while.end + +while.body: ; preds = %land.rhs + %inc = add nuw nsw i32 %i.06, 1 + %cmp = icmp ult i32 %i.06, 31 + br i1 %cmp, label %land.rhs, label %while.end + +while.end: ; preds = %while.body, %land.rhs + %i.0.lcssa = phi i32 [ %inc, %while.body ], [ %i.06, %land.rhs ] + ret i32 %i.0.lcssa +} + +; To recognize this pattern: +; int cttz(unsigned long x) { +; unsigned long i = 0; +; while(i < 64 && (((x >> i) & 0x1) == 0)) +; i++; +; return i; +; } +; +; CHECK: entry +; CHECK: llvm.cttz.i64 +; CHECK: ret +define i32 @cttz64(i64 %x) nounwind uwtable readnone ssp { +entry: + br label %land.rhs + +land.rhs: ; preds = %entry, %while.body + %i.06 = phi i64 [ 0, %entry ], [ %inc, %while.body ] + %0 = shl nuw i64 1, %i.06 + %1 = and i64 %0, %x + %cmp1 = icmp eq i64 %1, 0 + br i1 %cmp1, label %while.body, label %while.end + +while.body: ; preds = %land.rhs + %inc = add nuw nsw i64 %i.06, 1 + %cmp = icmp ult i64 %i.06, 63 + br i1 %cmp, label %land.rhs, label %while.end + +while.end: ; preds = %while.body, %land.rhs + %i.0.lcssa = phi i64 [ %inc, %while.body ], [ %i.06, %land.rhs ] + %conv = trunc i64 %i.0.lcssa to i32 + ret i32 %conv +} \ No newline at end of file