Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -3058,7 +3058,7 @@ virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const; - SDValue LowerToCtlzSrlPair(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerToCtlzSrlPair(SDValue Op, EVT ExtTy, SelectionDAG &DAG) const; private: SDValue simplifySetCCWithAnd(EVT VT, SDValue N0, SDValue N1, Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3563,11 +3563,13 @@ return CallResult.first; } -SDValue TargetLowering::LowerToCtlzSrlPair(SDValue Op, - SelectionDAG &DAG) const { +llvm::SDValue +llvm::TargetLowering::LowerToCtlzSrlPair(SDValue Op, EVT ExtTy, + SelectionDAG &DAG) const { assert((Op->getOpcode() == ISD::SETCC) && "Input has to be a SETCC node."); if (!isCtlzFast()) return SDValue(); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); SDLoc dl(Op); if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { @@ -3582,7 +3584,10 @@ SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext); SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz, DAG.getConstant(Log2b, dl, MVT::i32)); - return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc); + if (ExtTy.bitsLE(VT)) + return DAG.getNode(ISD::TRUNCATE, dl, ExtTy, Scc); + else + return DAG.getNode(ISD::ZERO_EXTEND, dl, ExtTy, Scc); } } return SDValue(); Index: lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCISelLowering.cpp +++ lib/Target/PowerPC/PPCISelLowering.cpp @@ -2362,7 +2362,7 @@ // If we're comparing for equality to zero, expose the fact that this is // implemented as a ctlz/srl pair on ppc, so that the dag combiner can // fold the new nodes. 
- if (SDValue V = LowerToCtlzSrlPair(Op, DAG)) + if (SDValue V = LowerToCtlzSrlPair(Op, MVT::i32, DAG)) return V; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { Index: lib/Target/X86/X86.td =================================================================== --- lib/Target/X86/X86.td +++ lib/Target/X86/X86.td @@ -179,6 +179,9 @@ "Support FS/GS Base instructions">; def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true", "Support LZCNT instruction">; +// On some architectures, such as AMD's Jaguar, LZCNT is fast. +def FeatureFastLZCNT : SubtargetFeature<"fastlzcnt", "HasFastLZCNT", "true", + "LZCNT instructions are fast">; def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true", "Support BMI instructions">; def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true", @@ -646,6 +649,7 @@ FeatureF16C, FeatureMOVBE, FeatureLZCNT, + FeatureFastLZCNT, FeaturePOPCNT, FeatureXSAVE, FeatureXSAVEOPT, Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -760,6 +760,8 @@ bool isCheapToSpeculateCtlz() const override; + bool isCtlzFast() const override; + bool hasBitPreservingFPLogic(EVT VT) const override { return VT == MVT::f32 || VT == MVT::f64 || VT.isVector(); } Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -4188,6 +4188,10 @@ return Subtarget.hasLZCNT(); } +bool X86TargetLowering::isCtlzFast() const { + return Subtarget.hasLZCNT() && Subtarget.hasFastLZCNT(); +} + bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { if (!Subtarget.hasBMI()) return false; @@ -30773,6 +30777,33 @@ if (SDValue DivRem8 = getDivRem8(N, DAG)) return DivRem8; + // Patterns with interesting uses do not benefit from srl(ctlz) + // transformation. 
+ auto hasInterestingUses = [](SDNode *N) { + bool HasInterestingUses = false; + for (auto Use : N->uses()) { + if (Use->getOpcode() == ISD::AND || Use->getOpcode() == ISD::OR || + Use->getOpcode() == ISD::XOR || Use->getOpcode() == ISD::ADD || + Use->getOpcode() == ISD::SUB || Use->getOpcode() == ISD::MUL || + Use->getOpcode() == ISD::SDIV || Use->getOpcode() == ISD::UDIV || + Use->getOpcode() == ISD::SREM || Use->getOpcode() == ISD::UREM || + Use->getOpcode() == ISD::SELECT) { + HasInterestingUses = true; + break; + } + } + return HasInterestingUses; + }; + + if (N0->getOpcode() == ISD::SETCC && N0->hasOneUse() && + N0->getOperand(0).getValueType().bitsGE(MVT::i32) && + N->getSimpleValueType(0).bitsGE(MVT::i32) && !DCI.isBeforeLegalize() && + !hasInterestingUses(N)) + if (SDValue V = Subtarget.getTargetLowering()->LowerToCtlzSrlPair( + N0, N->getValueType(0), DAG)) { + return V; + } + return SDValue(); } Index: lib/Target/X86/X86InstrInfo.td =================================================================== --- lib/Target/X86/X86InstrInfo.td +++ lib/Target/X86/X86InstrInfo.td @@ -834,6 +834,7 @@ def HasF16C : Predicate<"Subtarget->hasF16C()">; def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">; def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; +def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasBMI : Predicate<"Subtarget->hasBMI()">; def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; def HasVBMI : Predicate<"Subtarget->hasVBMI()">, Index: lib/Target/X86/X86Subtarget.h =================================================================== --- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -215,6 +215,9 @@ /// 64-bit divisions and should be used when possible. bool HasSlowDivide64; + /// True if the LZCNT instruction is fast. + bool HasFastLZCNT; + /// True if the short functions should be padded to prevent /// a stall when returning too early. 
bool PadShortFunctions; @@ -422,6 +425,7 @@ bool hasF16C() const { return HasF16C; } bool hasFSGSBase() const { return HasFSGSBase; } bool hasLZCNT() const { return HasLZCNT; } + bool hasFastLZCNT() const { return HasFastLZCNT; } bool hasBMI() const { return HasBMI; } bool hasBMI2() const { return HasBMI2; } bool hasVBMI() const { return HasVBMI; } Index: lib/Target/X86/X86Subtarget.cpp =================================================================== --- lib/Target/X86/X86Subtarget.cpp +++ lib/Target/X86/X86Subtarget.cpp @@ -254,6 +254,7 @@ HasF16C = false; HasFSGSBase = false; HasLZCNT = false; + HasFastLZCNT = false; HasBMI = false; HasBMI2 = false; HasVBMI = false; Index: test/CodeGen/X86/lzcnt-zext-cmp.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/lzcnt-zext-cmp.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Test patterns which generate lzcnt instructions. +; E.g.: zext(setcc(cmp)) -> shr(lzcnt) +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+lzcnt -mcpu=haswell | FileCheck --check-prefix=NOFASTLZCNT %s + +define i32 @foo1(i32 %a) { +; CHECK-LABEL: foo1: +; CHECK: # BB#0: +; CHECK-NEXT: lzcntl %edi, %eax +; CHECK-NEXT: shrl $5, %eax +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: foo1: +; NOFASTLZCNT: # BB#0: +; NOFASTLZCNT-NEXT: xorl %eax, %eax +; NOFASTLZCNT-NEXT: testl %edi, %edi +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: retq + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv + +} + +define i64 @foo2(i32 %a) { +; CHECK-LABEL: foo2: +; CHECK: # BB#0: +; CHECK-NEXT: lzcntl %edi, %eax +; CHECK-NEXT: shrl $5, %eax +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: foo2: +; NOFASTLZCNT: # BB#0: +; NOFASTLZCNT-NEXT: xorl %eax, %eax +; NOFASTLZCNT-NEXT: testl %edi, %edi +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: retq + %cmp = icmp eq i32 %a, 0 + 
%conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +define i64 @foo3(i64 %a) { +; CHECK-LABEL: foo3: +; CHECK: # BB#0: +; CHECK-NEXT: lzcntq %rdi, %rax +; CHECK-NEXT: shrq $6, %rax +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: foo3: +; NOFASTLZCNT: # BB#0: +; NOFASTLZCNT-NEXT: xorl %eax, %eax +; NOFASTLZCNT-NEXT: testq %rdi, %rdi +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: retq + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +define i32 @foo4(i64 %a) { +; CHECK-LABEL: foo4: +; CHECK: # BB#0: +; CHECK-NEXT: lzcntq %rdi, %rax +; CHECK-NEXT: shrq $6, %rax +; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: foo4: +; NOFASTLZCNT: # BB#0: +; NOFASTLZCNT-NEXT: xorl %eax, %eax +; NOFASTLZCNT-NEXT: testq %rdi, %rdi +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: retq + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +define i16 @foo5(i16 %a) { +; CHECK-LABEL: foo5: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testw %di, %di +; CHECK-NEXT: sete %al +; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: foo5: +; NOFASTLZCNT: # BB#0: +; NOFASTLZCNT-NEXT: xorl %eax, %eax +; NOFASTLZCNT-NEXT: testw %di, %di +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NOFASTLZCNT-NEXT: retq + %cmp = icmp eq i16 %a, 0 + %conv = zext i1 %cmp to i16 + ret i16 %conv +}