Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -179,6 +179,9 @@
                                        "Support FS/GS Base instructions">;
 def FeatureLZCNT   : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
                                       "Support LZCNT instruction">;
+// On some architectures, such as AMD's Jaguar, LZCNT is fast.
+def FeatureFastLZCNT : SubtargetFeature<"fastlzcnt", "HasFastLZCNT", "true",
+                                        "LZCNT instructions are fast">;
 def FeatureBMI     : SubtargetFeature<"bmi", "HasBMI", "true",
                                       "Support BMI instructions">;
 def FeatureBMI2    : SubtargetFeature<"bmi2", "HasBMI2", "true",
@@ -631,6 +634,7 @@
   FeatureF16C,
   FeatureMOVBE,
   FeatureLZCNT,
+  FeatureFastLZCNT,
   FeaturePOPCNT,
   FeatureXSAVE,
   FeatureXSAVEOPT,
Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -834,6 +834,7 @@
 def HasF16C      : Predicate<"Subtarget->hasF16C()">;
 def HasFSGSBase  : Predicate<"Subtarget->hasFSGSBase()">;
 def HasLZCNT     : Predicate<"Subtarget->hasLZCNT()">;
+def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
 def HasBMI       : Predicate<"Subtarget->hasBMI()">;
 def HasBMI2      : Predicate<"Subtarget->hasBMI2()">;
 def HasVBMI      : Predicate<"Subtarget->hasVBMI()">,
Index: lib/Target/X86/X86InstrShiftRotate.td
===================================================================
--- lib/Target/X86/X86InstrShiftRotate.td
+++ lib/Target/X86/X86InstrShiftRotate.td
@@ -967,3 +967,83 @@
   // As SARXrr/SHRXrr/SHLXrr is favored on variable shift, the peephole
   // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible.
 }
+
+let Predicates = [HasLZCNT, HasFastLZCNT] in {
+
+  // Transform an equality comparison with 0, followed by a zero extend,
+  // into lzcnt + shift.
+  // E.g.:
+  //
+  //   testl %edi, %edi
+  //   sete %al
+  //   movzbl %al, %eax
+  //
+  // into
+  //
+  //   lzcntl %edi, %eax
+  //   shrl $5, %eax
+  //
+
+  // Shift by 4 (log2 of the operand width) for the 16-bit flavor.
+  def : Pat<(zext (X86setcc X86_COND_E, (X86cmp GR16:$src, (i16 0)))),
+            (SHR16ri (LZCNT16rr GR16:$src), (i8 4))>;
+
+  // Shift by 5 (log2 of the operand width) for the 32-bit flavor.
+  def : Pat<(zext (X86setcc X86_COND_E, (X86cmp GR32:$src, (i32 0)))),
+            (SHR32ri (LZCNT32rr GR32:$src), (i8 5))>;
+
+  // Shift by 6 (log2 of the operand width) for the 64-bit flavor.
+  def : Pat<(zext (X86setcc X86_COND_E, (X86cmp GR64:$src, (i64 0)))),
+            (SHR64ri (LZCNT64rr GR64:$src), (i8 6))>;
+
+  // Input is 64-bit, result is 32-bit: take sub_32bit of the 64-bit shift.
+  def : Pat<(i32 (zext (X86setcc X86_COND_E, (X86cmp GR64:$src, (i64 0))))),
+            (EXTRACT_SUBREG
+              (SHR64ri (LZCNT64rr GR64:$src), (i8 6)),
+              sub_32bit)>;
+
+  // Input is 32-bit, result is 64-bit: wrap the 32-bit shift as sub_32bit.
+  def : Pat<(i64 (zext (X86setcc X86_COND_E, (X86cmp GR32:$src, (i32 0))))),
+            (SUBREG_TO_REG
+              (i64 0),
+              (SHR32ri(LZCNT32rr GR32:$src), (i8 5)),
+              sub_32bit)>;
+
+  // Transform two OR'ed equality comparisons with 0, followed by a zero
+  // extend, into lzcnt + or + shift.
+  //
+  // E.g.:
+  //
+  //   testl %edi, %edi
+  //   sete %al
+  //   testl %esi, %esi
+  //   sete %cl
+  //   orb %al, %cl
+  //   movzbl %cl, %eax
+  //
+  // into
+  //
+  //   lzcntl %esi, %ecx
+  //   lzcntl %edi, %eax
+  //   orl %ecx, %eax
+  //   shrl $5, %eax
+  //
+
+  def : Pat<(zext (or (X86setcc X86_COND_E, (X86cmp GR16:$src1, (i16 0))),
+                      (X86setcc X86_COND_E, (X86cmp GR16:$src2, (i16 0))))),
+            (SHR16ri (OR16rr (LZCNT16rr GR16:$src1),
+                             (LZCNT16rr GR16:$src2)),
+                     (i8 4))>;
+
+  def : Pat<(zext (or (X86setcc X86_COND_E, (X86cmp GR32:$src1, (i32 0))),
+                      (X86setcc X86_COND_E, (X86cmp GR32:$src2, (i32 0))))),
+            (SHR32ri (OR32rr (LZCNT32rr GR32:$src1),
+                             (LZCNT32rr GR32:$src2)),
+                     (i8 5))>;
+
+  def : Pat<(zext (or (X86setcc X86_COND_E, (X86cmp GR64:$src1, (i64 0))),
+                      (X86setcc X86_COND_E, (X86cmp GR64:$src2, (i64 0))))),
+            (SHR64ri (OR64rr (LZCNT64rr GR64:$src1),
+                             (LZCNT64rr GR64:$src2)),
+                     (i8 6))>;
+}
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -207,6 +207,9 @@
   /// 64-bit divisions and should be used when possible.
   bool HasSlowDivide64;
 
+  /// True if the LZCNT instruction is fast.
+  bool HasFastLZCNT;
+
   /// True if the short functions should be padded to prevent
   /// a stall when returning too early.
   bool PadShortFunctions;
@@ -414,6 +417,7 @@
   bool hasF16C() const { return HasF16C; }
   bool hasFSGSBase() const { return HasFSGSBase; }
   bool hasLZCNT() const { return HasLZCNT; }
+  bool hasFastLZCNT() const { return HasFastLZCNT; }
   bool hasBMI() const { return HasBMI; }
   bool hasBMI2() const { return HasBMI2; }
   bool hasVBMI() const { return HasVBMI; }
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -254,6 +254,7 @@
   HasF16C = false;
   HasFSGSBase = false;
   HasLZCNT = false;
+  HasFastLZCNT = false;
   HasBMI = false;
   HasBMI2 = false;
   HasVBMI = false;
Index: test/CodeGen/X86/lzcnt-zext-cmp.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/lzcnt-zext-cmp.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Test patterns which generate lzcnt instructions.
+; E.g.: zext(setcc(cmp)) -> shr(lzcnt)
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+lzcnt -mcpu=haswell | FileCheck --check-prefix=NOFASTLZCNT %s
+
+define i32 @foo1(i32 %a) {
+; CHECK-LABEL: foo1:
+; CHECK: # BB#0:
+; CHECK-NEXT: lzcntl %edi, %eax
+; CHECK-NEXT: shrl $5, %eax
+; CHECK-NEXT: retq
+;
+; NOFASTLZCNT-LABEL: foo1:
+; NOFASTLZCNT: # BB#0:
+; NOFASTLZCNT-NEXT: testl %edi, %edi
+; NOFASTLZCNT-NEXT: sete %al
+; NOFASTLZCNT-NEXT: movzbl %al, %eax
+; NOFASTLZCNT-NEXT: retq
+  %cmp = icmp eq i32 %a, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+
+}
+
+define i64 @foo2(i32 %a) {
+; CHECK-LABEL: foo2:
+; CHECK: # BB#0:
+; CHECK-NEXT: lzcntl %edi, %eax
+; CHECK-NEXT: shrl $5, %eax
+; CHECK-NEXT: retq
+;
+; NOFASTLZCNT-LABEL: foo2:
+; NOFASTLZCNT: # BB#0:
+; NOFASTLZCNT-NEXT: testl %edi, %edi
+; NOFASTLZCNT-NEXT: sete %al
+; NOFASTLZCNT-NEXT: movzbl %al, %eax
+; NOFASTLZCNT-NEXT: retq
+  %cmp = icmp eq i32 %a, 0
+  %conv1 = zext i1 %cmp to i64
+  ret i64 %conv1
+}
+
+define i64 @foo3(i64 %a) {
+; CHECK-LABEL: foo3:
+; CHECK: # BB#0:
+; CHECK-NEXT: lzcntq %rdi, %rax
+; CHECK-NEXT: shrq $6, %rax
+; CHECK-NEXT: retq
+;
+; NOFASTLZCNT-LABEL: foo3:
+; NOFASTLZCNT: # BB#0:
+; NOFASTLZCNT-NEXT: testq %rdi, %rdi
+; NOFASTLZCNT-NEXT: sete %al
+; NOFASTLZCNT-NEXT: movzbl %al, %eax
+; NOFASTLZCNT-NEXT: retq
+  %cmp = icmp eq i64 %a, 0
+  %conv1 = zext i1 %cmp to i64
+  ret i64 %conv1
+}
+
+define i32 @foo4(i64 %a) {
+; CHECK-LABEL: foo4:
+; CHECK: # BB#0:
+; CHECK-NEXT: lzcntq %rdi, %rax
+; CHECK-NEXT: shrq $6, %rax
+; CHECK-NEXT: retq
+;
+; NOFASTLZCNT-LABEL: foo4:
+; NOFASTLZCNT: # BB#0:
+; NOFASTLZCNT-NEXT: testq %rdi, %rdi
+; NOFASTLZCNT-NEXT: sete %al
+; NOFASTLZCNT-NEXT: movzbl %al, %eax
+; NOFASTLZCNT-NEXT: retq
+  %cmp = icmp eq i64 %a, 0
+  %conv1 = zext i1 %cmp to i32
+  ret i32 %conv1
+}
+
+define i16 @foo5(i16 %a) {
+; CHECK-LABEL: foo5:
+; CHECK: # BB#0:
+; CHECK-NEXT: lzcntw %di, %ax
+; CHECK-NEXT: shrw $4, %ax
+; CHECK-NEXT: retq
+;
+; NOFASTLZCNT-LABEL: foo5:
+; NOFASTLZCNT: # BB#0:
+; NOFASTLZCNT-NEXT: testw %di, %di
+; NOFASTLZCNT-NEXT: sete %al
+; NOFASTLZCNT-NEXT: movzbl %al, %eax
+; NOFASTLZCNT-NEXT: retq
+  %cmp = icmp eq i16 %a, 0
+  %conv = zext i1 %cmp to i16
+  ret i16 %conv
+}
+
+define i32 @bar1(i32 %a, i32 %b) {
+; CHECK-LABEL: bar1:
+; CHECK: # BB#0:
+; CHECK-NEXT: lzcntl %esi, %ecx
+; CHECK-NEXT: lzcntl %edi, %eax
+; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: shrl $5, %eax
+; CHECK-NEXT: retq
+;
+; NOFASTLZCNT-LABEL: bar1:
+; NOFASTLZCNT: # BB#0:
+; NOFASTLZCNT-NEXT: testl %edi, %edi
+; NOFASTLZCNT-NEXT: sete %al
+; NOFASTLZCNT-NEXT: testl %esi, %esi
+; NOFASTLZCNT-NEXT: sete %cl
+; NOFASTLZCNT-NEXT: orb %al, %cl
+; NOFASTLZCNT-NEXT: movzbl %cl, %eax
+; NOFASTLZCNT-NEXT: retq
+  %cmp = icmp eq i32 %a, 0
+  %cmp1 = icmp eq i32 %b, 0
+  %or = or i1 %cmp, %cmp1
+  %lor.ext = zext i1 %or to i32
+  ret i32 %lor.ext
+}
+
+define i64 @bar2(i64 %a, i64 %b) {
+; CHECK-LABEL: bar2:
+; CHECK: # BB#0:
+; CHECK-NEXT: lzcntq %rsi, %rcx
+; CHECK-NEXT: lzcntq %rdi, %rax
+; CHECK-NEXT: orq %rcx, %rax
+; CHECK-NEXT: shrq $6, %rax
+; CHECK-NEXT: retq
+;
+; NOFASTLZCNT-LABEL: bar2:
+; NOFASTLZCNT: # BB#0:
+; NOFASTLZCNT-NEXT: testq %rdi, %rdi
+; NOFASTLZCNT-NEXT: sete %al
+; NOFASTLZCNT-NEXT: testq %rsi, %rsi
+; NOFASTLZCNT-NEXT: sete %cl
+; NOFASTLZCNT-NEXT: orb %al, %cl
+; NOFASTLZCNT-NEXT: movzbl %cl, %eax
+; NOFASTLZCNT-NEXT: retq
+  %cmp = icmp eq i64 %a, 0
+  %cmp1 = icmp eq i64 %b, 0
+  %or = or i1 %cmp, %cmp1
+  %lor.ext = zext i1 %or to i64
+  ret i64 %lor.ext
+}
+
+define i16 @bar3(i16 %a, i16 %b) {
+; CHECK-LABEL: bar3:
+; CHECK: # BB#0:
+; CHECK-NEXT: lzcntw %si, %cx
+; CHECK-NEXT: lzcntw %di, %ax
+; CHECK-NEXT: orw %cx, %ax
+; CHECK-NEXT: shrw $4, %ax
+; CHECK-NEXT: retq
+;
+; NOFASTLZCNT-LABEL: bar3:
+; NOFASTLZCNT: # BB#0:
+; NOFASTLZCNT-NEXT: testw %di, %di
+; NOFASTLZCNT-NEXT: sete %al
+; NOFASTLZCNT-NEXT: testw %si, %si
+; NOFASTLZCNT-NEXT: sete %cl
+; NOFASTLZCNT-NEXT: orb %al, %cl
+; NOFASTLZCNT-NEXT: movzbl %cl, %eax
+; NOFASTLZCNT-NEXT: retq
+  %cmp = icmp eq i16 %a, 0
+  %cmp1 = icmp eq i16 %b, 0
+  %or = or i1 %cmp, %cmp1
+  %lor.ext = zext i1 %or to i16
+  ret i16 %lor.ext
+}