Index: llvm/trunk/include/llvm/Target/TargetLowering.h
===================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h
+++ llvm/trunk/include/llvm/Target/TargetLowering.h
@@ -249,6 +249,16 @@
     return true;
   }
 
+  /// \brief Return true if it is cheap to speculate a call to intrinsic cttz.
+  virtual bool isCheapToSpeculateCttz() const {
+    return false;
+  }
+
+  /// \brief Return true if it is cheap to speculate a call to intrinsic ctlz.
+  virtual bool isCheapToSpeculateCtlz() const {
+    return false;
+  }
+
   /// \brief Return if the target supports combining a
   /// chain like:
   /// \code
Index: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
@@ -3978,6 +3978,119 @@
   Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
 }
 
+// See if we can speculate calls to intrinsic cttz/ctlz.
+//
+// Example:
+// entry:
+//   ...
+//   %cmp = icmp eq i64 %val, 0
+//   br i1 %cmp, label %end.bb, label %then.bb
+//
+// then.bb:
+//   %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 true)
+//   br label %EndBB
+//
+// end.bb:
+//   %cond = phi i64 [ %c, %then.bb ], [ 64, %entry ]
+//
+// ==>
+//
+// entry:
+//   ...
+//   %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 false)
+//
+static bool OptimizeBranchInst(BranchInst *BrInst, const TargetLowering &TLI) {
+  assert(BrInst->isConditional() && "Expected a conditional branch!");
+  BasicBlock *ThenBB = BrInst->getSuccessor(1);
+  BasicBlock *EndBB = BrInst->getSuccessor(0);
+
+  // See if ThenBB contains only one instruction (excluding the
+  // terminator and DbgInfoIntrinsic calls).
+  IntrinsicInst *II = nullptr;
+  for (BasicBlock::iterator I = ThenBB->begin(),
+                            E = std::prev(ThenBB->end()); I != E; ++I) {
+    // Skip debug info.
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    if (II)
+      // Avoid speculating more than one instruction.
+      return false;
+
+    // See if this is a call to intrinsic cttz/ctlz.
+    if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::cttz>())) {
+      // Avoid speculating expensive intrinsic calls.
+      if (!TLI.isCheapToSpeculateCttz())
+        return false;
+    } else if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::ctlz>())) {
+      // Avoid speculating expensive intrinsic calls.
+      if (!TLI.isCheapToSpeculateCtlz())
+        return false;
+    } else
+      return false;
+
+    II = cast<IntrinsicInst>(I);
+  }
+
+  // Look for PHI nodes with 'II' as the incoming value from 'ThenBB'.
+  BasicBlock *EntryBB = BrInst->getParent();
+  for (BasicBlock::iterator I = EndBB->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    Value *ThenV = PN->getIncomingValueForBlock(ThenBB);
+    Value *OrigV = PN->getIncomingValueForBlock(EntryBB);
+
+    if (!OrigV || ThenV != II)
+      return false;
+
+    if (ConstantInt *CInt = dyn_cast<ConstantInt>(OrigV)) {
+      unsigned BitWidth = ThenV->getType()->getIntegerBitWidth();
+
+      // Don't simplify this phi node unless 'OrigV' equals the bit width
+      // of the value passed to the cttz/ctlz intrinsic.
+      if (CInt->getValue() != BitWidth)
+        return false;
+
+      // Hoist the call to cttz/ctlz from ThenBB into EntryBB.
+      EntryBB->getInstList().splice(BrInst, ThenBB->getInstList(),
+                                    ThenBB->begin(), std::prev(ThenBB->end()));
+
+      // Update PN, setting ThenV as the incoming value from both 'EntryBB'
+      // and 'ThenBB'. Eventually, method 'OptimizeInst' will fold this
+      // phi node if all the incoming values are the same.
+      PN->setIncomingValue(PN->getBasicBlockIndex(EntryBB), ThenV);
+      PN->setIncomingValue(PN->getBasicBlockIndex(ThenBB), ThenV);
+
+      // Clear the 'undef on zero' flag of the cttz/ctlz intrinsic call.
+      if (cast<ConstantInt>(II->getArgOperand(1))->isOne()) {
+        Type *Ty = II->getArgOperand(0)->getType();
+        Value *Args[] = { II->getArgOperand(0),
+                          ConstantInt::getFalse(II->getContext()) };
+        Module *M = EntryBB->getParent()->getParent();
+        Value *IF = Intrinsic::getDeclaration(M, II->getIntrinsicID(), Ty);
+        IRBuilder<> Builder(BrInst);
+        Instruction *NewI = Builder.CreateCall(IF, Args);
+
+        // Replace the old call to cttz/ctlz.
+        II->replaceAllUsesWith(NewI);
+        II->eraseFromParent();
+      }
+
+      // Update BrInst condition so that the branch to EndBB is always taken.
+      // Later on, method 'ConstantFoldTerminator' will simplify this branch,
+      // replacing it with a direct branch to 'EndBB'.
+      // As a side effect, CodeGenPrepare will attempt to simplify the control
+      // flow graph by deleting basic block 'ThenBB' and merging 'EntryBB' into
+      // 'EndBB' (calling method 'EliminateFallThrough').
+      BrInst->setCondition(ConstantInt::getTrue(BrInst->getContext()));
+      return true;
+    }
+  }
+
+  return false;
+}
+
 /// Some targets can do store(extractelement) with one instruction.
 /// Try to push the extractelement towards the stores when the target
 /// has this feature and this is profitable.
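A note on the 'undef on zero' flag cleared above: the second operand of llvm.cttz/llvm.ctlz is the 'is_zero_undef' flag, and when it is true the result is undefined for a zero input. Once the call is hoisted past the zero check it can observe a zero input, so the flag must be cleared; with the flag false, the LangRef defines the result for zero as the bit width of the operand. A minimal IR illustration (the function name is hypothetical, not part of the patch):

define i64 @cttz_zero_is_defined() {
entry:
  ; is_zero_undef == false: a zero input is fully defined and the call
  ; returns the operand's bit width, 64 for i64. With is_zero_undef == true,
  ; the result of this same call would be undefined.
  %d = call i64 @llvm.cttz.i64(i64 0, i1 false)
  ret i64 %d
}

declare i64 @llvm.cttz.i64(i64, i1)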
@@ -4130,6 +4243,34 @@
   if (isa<ExtractElementInst>(I))
     return OptimizeExtractElementInst(I);
 
+  if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+    if (TLI && BI->isConditional() && BI->getCondition()->hasOneUse()) {
+      // Check if the branch condition compares a value against zero.
+      if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
+        if (ICI->getPredicate() == ICmpInst::ICMP_EQ &&
+            match(ICI->getOperand(1), m_Zero())) {
+          BasicBlock *ThenBB = BI->getSuccessor(1);
+          BasicBlock *EndBB = BI->getSuccessor(0);
+
+          // Check if ThenBB is only reachable from this basic block; also,
+          // check if EndBB has more than one predecessor.
+          if (ThenBB->getSinglePredecessor() &&
+              !EndBB->getSinglePredecessor()) {
+            TerminatorInst *TI = ThenBB->getTerminator();
+
+            if (TI->getNumSuccessors() == 1 && TI->getSuccessor(0) == EndBB &&
+                // Try to speculate calls to intrinsic cttz/ctlz from 'ThenBB'.
+                OptimizeBranchInst(BI, *TLI)) {
+              ModifiedDT = true;
+              return true;
+            }
+          }
+        }
+      }
+    }
+    return false;
+  }
+
   return false;
 }
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -633,6 +633,10 @@
     /// This method returns the name of a target specific DAG node.
     const char *getTargetNodeName(unsigned Opcode) const override;
 
+    bool isCheapToSpeculateCttz() const override;
+
+    bool isCheapToSpeculateCtlz() const override;
+
     /// Return the value type to use for ISD::SETCC.
     EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -3882,6 +3882,16 @@
   return (Index == 0 || Index == ResVT.getVectorNumElements());
 }
 
+bool X86TargetLowering::isCheapToSpeculateCttz() const {
+  // Don't try to speculate cttz if we can't directly use TZCNT.
+  return Subtarget->hasBMI();
+}
+
+bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+  // Don't try to speculate ctlz if we can't directly use LZCNT.
+  return Subtarget->hasLZCNT();
+}
+
 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
 /// the specified range (L, H].
 static bool isUndefOrInRange(int Val, int Low, int Hi) {
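Taken together with the CodeGenPrepare change above, these X86 hooks enable rewrites like the following sketch of the cttz case under -mattr=+bmi (it mirrors @test1b in the new test below; the function names here are illustrative only):

; Guarded form on input to CodeGenPrepare:
define i64 @cttz_guarded(i64 %A) {
entry:
  %tobool = icmp eq i64 %A, 0
  br i1 %tobool, label %cond.end, label %cond.true

cond.true:
  %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
  br label %cond.end

cond.end:
  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
  ret i64 %cond
}

; Speculated form produced when isCheapToSpeculateCttz() returns true:
define i64 @cttz_speculated(i64 %A) {
entry:
  %c = call i64 @llvm.cttz.i64(i64 %A, i1 false)
  ret i64 %c
}

declare i64 @llvm.cttz.i64(i64, i1)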
Index: llvm/trunk/test/CodeGen/X86/cttz-ctlz.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/cttz-ctlz.ll
+++ llvm/trunk/test/CodeGen/X86/cttz-ctlz.ll
@@ -0,0 +1,250 @@
+; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -mattr=+bmi < %s | FileCheck %s --check-prefix=ALL --check-prefix=BMI
+; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -mattr=+lzcnt < %s | FileCheck %s --check-prefix=ALL --check-prefix=LZCNT
+; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=ALL --check-prefix=GENERIC
+
+
+define i64 @test1(i64 %A) {
+; ALL-LABEL: @test1(
+; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i64 @llvm.ctlz.i64(i64 %A, i1 false)
+; LZCNT-NEXT: ret i64 [[CTLZ]]
+; BMI: icmp eq i64 %A, 0
+; BMI: call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+; GENERIC: icmp eq i64 %A, 0
+; GENERIC: call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
+  ret i64 %cond
+}
+
+
+define i32 @test2(i32 %A) {
+; ALL-LABEL: @test2(
+; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i32 @llvm.ctlz.i32(i32 %A, i1 false)
+; LZCNT-NEXT: ret i32 [[CTLZ]]
+; BMI: icmp eq i32 %A, 0
+; BMI: call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+; GENERIC: icmp eq i32 %A, 0
+; GENERIC: call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3(i16 signext %A) {
+; ALL-LABEL: @test3(
+; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i16 @llvm.ctlz.i16(i16 %A, i1 false)
+; LZCNT-NEXT: ret i16 [[CTLZ]]
+; BMI: icmp eq i16 %A, 0
+; BMI: call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+; GENERIC: icmp eq i16 %A, 0
+; GENERIC: call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ]
+  ret i16 %cond
+}
+
+
+define i64 @test1b(i64 %A) {
+; ALL-LABEL: @test1b(
+; LZCNT: icmp eq i64 %A, 0
+; LZCNT: call i64 @llvm.cttz.i64(i64 %A, i1 true)
+; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i64 @llvm.cttz.i64(i64 %A, i1 false)
+; BMI-NEXT: ret i64 [[CTTZ]]
+; GENERIC: icmp eq i64 %A, 0
+; GENERIC: call i64 @llvm.cttz.i64(i64 %A, i1 true)
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
+  ret i64 %cond
+}
+
+
+define i32 @test2b(i32 %A) {
+; ALL-LABEL: @test2b(
+; LZCNT: icmp eq i32 %A, 0
+; LZCNT: call i32 @llvm.cttz.i32(i32 %A, i1 true)
+; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i32 @llvm.cttz.i32(i32 %A, i1 false)
+; BMI-NEXT: ret i32 [[CTTZ]]
+; GENERIC: icmp eq i32 %A, 0
+; GENERIC: call i32 @llvm.cttz.i32(i32 %A, i1 true)
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3b(i16 signext %A) {
+; ALL-LABEL: @test3b(
+; LZCNT: icmp eq i16 %A, 0
+; LZCNT: call i16 @llvm.cttz.i16(i16 %A, i1 true)
+; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i16 @llvm.cttz.i16(i16 %A, i1 false)
+; BMI-NEXT: ret i16 [[CTTZ]]
+; GENERIC: icmp eq i16 %A, 0
+; GENERIC: call i16 @llvm.cttz.i16(i16 %A, i1 true)
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ]
+  ret i16 %cond
+}
+
+
+define i64 @test1c(i64 %A) {
+; ALL-LABEL: @test1c(
+; ALL: icmp eq i64 %A, 0
+; ALL: call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 63, %entry ]
+  ret i64 %cond
+}
+
+define i32 @test2c(i32 %A) {
+; ALL-LABEL: @test2c(
+; ALL: icmp eq i32 %A, 0
+; ALL: call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 31, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3c(i16 signext %A) {
+; ALL-LABEL: @test3c(
+; ALL: icmp eq i16 %A, 0
+; ALL: call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 15, %entry ]
+  ret i16 %cond
+}
+
+
+define i64 @test1d(i64 %A) {
+; ALL-LABEL: @test1d(
+; ALL: icmp eq i64 %A, 0
+; ALL: call i64 @llvm.cttz.i64(i64 %A, i1 true)
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 63, %entry ]
+  ret i64 %cond
+}
+
+
+define i32 @test2d(i32 %A) {
+; ALL-LABEL: @test2d(
+; ALL: icmp eq i32 %A, 0
+; ALL: call i32 @llvm.cttz.i32(i32 %A, i1 true)
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 31, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3d(i16 signext %A) {
+; ALL-LABEL: @test3d(
+; ALL: icmp eq i16 %A, 0
+; ALL: call i16 @llvm.cttz.i16(i16 %A, i1 true)
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 15, %entry ]
+  ret i16 %cond
+}
+
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
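For quick experimentation outside the lit harness, the RUN lines above can be replayed on a standalone file; for example (file name hypothetical):

; repro.ll -- reproduce the ctlz rewrite with:
;   opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -mattr=+lzcnt repro.ll
define i64 @guarded_ctlz(i64 %A) {
entry:
  %tobool = icmp eq i64 %A, 0
  br i1 %tobool, label %cond.end, label %cond.true

cond.true:
  %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
  br label %cond.end

cond.end:
  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
  ret i64 %cond
}

declare i64 @llvm.ctlz.i64(i64, i1)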