Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -400,6 +400,10 @@
 def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
                                         "Support movdir64b instruction">;
 
+def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
+          "Indicates that the BEXTR instruction is implemented as a single uop "
+          "with good throughput.">;
+
 //===----------------------------------------------------------------------===//
 // Register File Description
 //===----------------------------------------------------------------------===//
@@ -987,6 +991,7 @@
   FeatureSlowSHLD,
   FeatureLAHFSAHF,
   FeatureFast15ByteNOP,
+  FeatureFastBEXTR,
   FeatureFastPartialYMMorZMMWrite
 ]>;
 
@@ -1042,6 +1047,7 @@
   FeatureSlowSHLD,
   FeatureLAHFSAHF,
   FeatureFast11ByteNOP,
+  FeatureFastBEXTR,
   FeatureMacroFusion
 ]>;
 
@@ -1074,6 +1080,7 @@
   FeatureFSGSBase,
   FeatureLAHFSAHF,
   FeatureFast11ByteNOP,
+  FeatureFastBEXTR,
   FeatureMacroFusion
 ]>;
 
@@ -1105,6 +1112,7 @@
   FeatureSlowSHLD,
   FeatureFSGSBase,
   FeatureLAHFSAHF,
+  FeatureFastBEXTR,
   FeatureFast11ByteNOP,
   FeatureMWAITX,
   FeatureMacroFusion
@@ -1130,6 +1138,7 @@
   FeatureFastLZCNT,
   FeatureLAHFSAHF,
   FeatureLZCNT,
+  FeatureFastBEXTR,
   FeatureFast15ByteNOP,
   FeatureMacroFusion,
   FeatureMMX,
Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2590,7 +2590,14 @@
   SDValue N0 = Node->getOperand(0);
   SDValue N1 = Node->getOperand(1);
 
-  if (!Subtarget->hasBMI() && !Subtarget->hasTBM())
+  // If we have TBM we can use an immediate for the control. If we have BMI
+  // we should only do this if the BEXTR instruction is implemented well.
+  // Otherwise moving the control into a register makes this more costly.
+  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
+  // hoisting the move immediate would make it worthwhile with a less optimal
+  // BEXTR?
+  if (!Subtarget->hasTBM() &&
+      !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
     return false;
 
   // Must have a shift right.
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -385,6 +385,9 @@
   /// Processor supports PCONFIG instruction
   bool HasPCONFIG = false;
 
+  /// Processor has a single uop BEXTR implementation.
+  bool HasFastBEXTR = false;
+
   /// Use a retpoline thunk rather than indirect calls to block speculative
   /// execution.
   bool UseRetpolineIndirectCalls = false;
@@ -629,6 +632,7 @@
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
   bool hasFastLZCNT() const { return HasFastLZCNT; }
   bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
+  bool hasFastBEXTR() const { return HasFastBEXTR; }
   bool hasMacroFusion() const { return HasMacroFusion; }
   bool hasERMSB() const { return HasERMSB; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
Index: test/CodeGen/X86/bmi-x86_64.ll
===================================================================
--- test/CodeGen/X86/bmi-x86_64.ll
+++ test/CodeGen/X86/bmi-x86_64.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=CHECK,BMI1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=CHECK,BMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=CHECK,BEXTR-SLOW,BMI1,BMI1-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=CHECK,BEXTR-SLOW,BMI2,BMI2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+fast-bextr | FileCheck %s --check-prefixes=CHECK,BEXTR-FAST,BMI1,BMI1-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2,+fast-bextr | FileCheck %s --check-prefixes=CHECK,BEXTR-FAST,BMI2,BMI2-FAST
 
 declare i64 @llvm.x86.bmi.bextr.64(i64, i64)
 
@@ -14,11 +16,18 @@
 }
 
 define i64 @bextr64b(i64 %x) uwtable ssp {
-; CHECK-LABEL: bextr64b:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
-; CHECK-NEXT: bextrl %eax, %edi, %eax
-; CHECK-NEXT: retq
+; BEXTR-SLOW-LABEL: bextr64b:
+; BEXTR-SLOW: # %bb.0:
+; BEXTR-SLOW-NEXT: movq %rdi, %rax
+; BEXTR-SLOW-NEXT: shrl $4, %eax
+; BEXTR-SLOW-NEXT: andl $4095, %eax # imm = 0xFFF
+; BEXTR-SLOW-NEXT: retq
+;
+; BEXTR-FAST-LABEL: bextr64b:
+; BEXTR-FAST: # %bb.0:
+; BEXTR-FAST-NEXT: movl $3076, %eax # imm = 0xC04
+; BEXTR-FAST-NEXT: bextrl %eax, %edi, %eax
+; BEXTR-FAST-NEXT: retq
   %1 = lshr i64 %x, 4
   %2 = and i64 %1, 4095
   ret i64 %2
@@ -37,11 +46,18 @@
 }
 
 define i64 @bextr64b_load(i64* %x) {
-; CHECK-LABEL: bextr64b_load:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
-; CHECK-NEXT: bextrl %eax, (%rdi), %eax
-; CHECK-NEXT: retq
+; BEXTR-SLOW-LABEL: bextr64b_load:
+; BEXTR-SLOW: # %bb.0:
+; BEXTR-SLOW-NEXT: movl (%rdi), %eax
+; BEXTR-SLOW-NEXT: shrl $4, %eax
+; BEXTR-SLOW-NEXT: andl $4095, %eax # imm = 0xFFF
+; BEXTR-SLOW-NEXT: retq
+;
+; BEXTR-FAST-LABEL: bextr64b_load:
+; BEXTR-FAST: # %bb.0:
+; BEXTR-FAST-NEXT: movl $3076, %eax # imm = 0xC04
+; BEXTR-FAST-NEXT: bextrl %eax, (%rdi), %eax
+; BEXTR-FAST-NEXT: retq
   %1 = load i64, i64* %x, align 8
   %2 = lshr i64 %1, 4
   %3 = and i64 %2, 4095
@@ -61,11 +77,25 @@
 }
 
 define i64 @bextr64d(i64 %a) {
-; CHECK-LABEL: bextr64d:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl $8450, %eax # imm = 0x2102
-; CHECK-NEXT: bextrq %rax, %rdi, %rax
-; CHECK-NEXT: retq
+; BMI1-SLOW-LABEL: bextr64d:
+; BMI1-SLOW: # %bb.0: # %entry
+; BMI1-SLOW-NEXT: shrq $2, %rdi
+; BMI1-SLOW-NEXT: movl $8448, %eax # imm = 0x2100
+; BMI1-SLOW-NEXT: bextrq %rax, %rdi, %rax
+; BMI1-SLOW-NEXT: retq
+;
+; BMI2-SLOW-LABEL: bextr64d:
+; BMI2-SLOW: # %bb.0: # %entry
+; BMI2-SLOW-NEXT: shrq $2, %rdi
+; BMI2-SLOW-NEXT: movb $33, %al
+; BMI2-SLOW-NEXT: bzhiq %rax, %rdi, %rax
+; BMI2-SLOW-NEXT: retq
+;
+; BEXTR-FAST-LABEL: bextr64d:
+; BEXTR-FAST: # %bb.0: # %entry
+; BEXTR-FAST-NEXT: movl $8450, %eax # imm = 0x2102
+; BEXTR-FAST-NEXT: bextrq %rax, %rdi, %rax
+; BEXTR-FAST-NEXT: retq
 entry:
   %shr = lshr i64 %a, 2
   %and = and i64 %shr, 8589934591
Index: test/CodeGen/X86/bmi.ll
===================================================================
--- test/CodeGen/X86/bmi.ll
+++ test/CodeGen/X86/bmi.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=CHECK,X86,BMI1,X86-BMI1
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=CHECK,X86,BMI2,X86-BMI2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=CHECK,X64,BMI1,X64-BMI1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=CHECK,X64,BMI2,X64-BMI2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=CHECK,X86,X86-SLOW-BEXTR
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=CHECK,X86,X86-SLOW-BEXTR
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=CHECK,X64,X64-SLOW-BEXTR
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=CHECK,X64,X64-SLOW-BEXTR
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi,+fast-bextr | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST-BEXTR
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+fast-bextr | FileCheck %s --check-prefixes=CHECK,X64,X64-FAST-BEXTR
 
 define i32 @andn32(i32 %x, i32 %y) {
 ; X86-LABEL: andn32:
@@ -342,17 +344,56 @@
 }
 
 define i32 @bextr32b(i32 %x) uwtable ssp {
-; X86-LABEL: bextr32b:
-; X86: # %bb.0:
-; X86-NEXT: movl $3076, %eax # imm = 0xC04
-; X86-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
-; X86-NEXT: retl
-;
-; X64-LABEL: bextr32b:
-; X64: # %bb.0:
-; X64-NEXT: movl $3076, %eax # imm = 0xC04
-; X64-NEXT: bextrl %eax, %edi, %eax
-; X64-NEXT: retq
+; X86-BMI1-SLOW-LABEL: bextr32b:
+; X86-BMI1-SLOW: # %bb.0:
+; X86-BMI1-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-SLOW-NEXT: shrl $4, %eax
+; X86-BMI1-SLOW-NEXT: andl $4095, %eax # imm = 0xFFF
+; X86-BMI1-SLOW-NEXT: retl
+;
+; X64-BMI1-SLOW-LABEL: bextr32b:
+; X64-BMI1-SLOW: # %bb.0:
+; X64-BMI1-SLOW-NEXT: movl %edi, %eax
+; X64-BMI1-SLOW-NEXT: shrl $4, %eax
+; X64-BMI1-SLOW-NEXT: andl $4095, %eax # imm = 0xFFF
+; X64-BMI1-SLOW-NEXT: retq
+;
+; X86-BMI1-FAST-LABEL: bextr32b:
+; X86-BMI1-FAST: # %bb.0:
+; X86-BMI1-FAST-NEXT: movl $3076, %eax # imm = 0xC04
+; X86-BMI1-FAST-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1-FAST-NEXT: retl
+;
+; X64-BMI1-FAST-LABEL: bextr32b:
+; X64-BMI1-FAST: # %bb.0:
+; X64-BMI1-FAST-NEXT: movl $3076, %eax # imm = 0xC04
+; X64-BMI1-FAST-NEXT: bextrl %eax, %edi, %eax
+; X64-BMI1-FAST-NEXT: retq
+; X86-SLOW-BEXTR-LABEL: bextr32b:
+; X86-SLOW-BEXTR: # %bb.0:
+; X86-SLOW-BEXTR-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-BEXTR-NEXT: shrl $4, %eax
+; X86-SLOW-BEXTR-NEXT: andl $4095, %eax # imm = 0xFFF
+; X86-SLOW-BEXTR-NEXT: retl
+;
+; X64-SLOW-BEXTR-LABEL: bextr32b:
+; X64-SLOW-BEXTR: # %bb.0:
+; X64-SLOW-BEXTR-NEXT: movl %edi, %eax
+; X64-SLOW-BEXTR-NEXT: shrl $4, %eax
+; X64-SLOW-BEXTR-NEXT: andl $4095, %eax # imm = 0xFFF
+; X64-SLOW-BEXTR-NEXT: retq
+;
+; X86-FAST-BEXTR-LABEL: bextr32b:
+; X86-FAST-BEXTR: # %bb.0:
+; X86-FAST-BEXTR-NEXT: movl $3076, %eax # imm = 0xC04
+; X86-FAST-BEXTR-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-FAST-BEXTR-NEXT: retl
+;
+; X64-FAST-BEXTR-LABEL: bextr32b:
+; X64-FAST-BEXTR: # %bb.0:
+; X64-FAST-BEXTR-NEXT: movl $3076, %eax # imm = 0xC04
+; X64-FAST-BEXTR-NEXT: bextrl %eax, %edi, %eax
+; X64-FAST-BEXTR-NEXT: retq
   %1 = lshr i32 %x, 4
   %2 = and i32 %1, 4095
   ret i32 %2
@@ -376,18 +417,60 @@
 }
 
 define i32 @bextr32b_load(i32* %x) uwtable ssp {
-; X86-LABEL: bextr32b_load:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl $3076, %ecx # imm = 0xC04
-; X86-NEXT: bextrl %ecx, (%eax), %eax
-; X86-NEXT: retl
-;
-; X64-LABEL: bextr32b_load:
-; X64: # %bb.0:
-; X64-NEXT: movl $3076, %eax # imm = 0xC04
-; X64-NEXT: bextrl %eax, (%rdi), %eax
-; X64-NEXT: retq
+; X86-BMI1-SLOW-LABEL: bextr32b_load:
+; X86-BMI1-SLOW: # %bb.0:
+; X86-BMI1-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-SLOW-NEXT: movl (%eax), %eax
+; X86-BMI1-SLOW-NEXT: shrl $4, %eax
+; X86-BMI1-SLOW-NEXT: andl $4095, %eax # imm = 0xFFF
+; X86-BMI1-SLOW-NEXT: retl
+;
+; X64-BMI1-SLOW-LABEL: bextr32b_load:
+; X64-BMI1-SLOW: # %bb.0:
+; X64-BMI1-SLOW-NEXT: movl (%rdi), %eax
+; X64-BMI1-SLOW-NEXT: shrl $4, %eax
+; X64-BMI1-SLOW-NEXT: andl $4095, %eax # imm = 0xFFF
+; X64-BMI1-SLOW-NEXT: retq
+;
+; X86-BMI1-FAST-LABEL: bextr32b_load:
+; X86-BMI1-FAST: # %bb.0:
+; X86-BMI1-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-FAST-NEXT: movl $3076, %ecx # imm = 0xC04
+; X86-BMI1-FAST-NEXT: bextrl %ecx, (%eax), %eax
+; X86-BMI1-FAST-NEXT: retl
+;
+; X64-BMI1-FAST-LABEL: bextr32b_load:
+; X64-BMI1-FAST: # %bb.0:
+; X64-BMI1-FAST-NEXT: movl $3076, %eax # imm = 0xC04
+; X64-BMI1-FAST-NEXT: bextrl %eax, (%rdi), %eax
+; X64-BMI1-FAST-NEXT: retq
+; X86-SLOW-BEXTR-LABEL: bextr32b_load:
+; X86-SLOW-BEXTR: # %bb.0:
+; X86-SLOW-BEXTR-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-BEXTR-NEXT: movl (%eax), %eax
+; X86-SLOW-BEXTR-NEXT: shrl $4, %eax
+; X86-SLOW-BEXTR-NEXT: andl $4095, %eax # imm = 0xFFF
+; X86-SLOW-BEXTR-NEXT: retl
+;
+; X64-SLOW-BEXTR-LABEL: bextr32b_load:
+; X64-SLOW-BEXTR: # %bb.0:
+; X64-SLOW-BEXTR-NEXT: movl (%rdi), %eax
+; X64-SLOW-BEXTR-NEXT: shrl $4, %eax
+; X64-SLOW-BEXTR-NEXT: andl $4095, %eax # imm = 0xFFF
+; X64-SLOW-BEXTR-NEXT: retq
+;
+; X86-FAST-BEXTR-LABEL: bextr32b_load:
+; X86-FAST-BEXTR: # %bb.0:
+; X86-FAST-BEXTR-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-BEXTR-NEXT: movl $3076, %ecx # imm = 0xC04
+; X86-FAST-BEXTR-NEXT: bextrl %ecx, (%eax), %eax
+; X86-FAST-BEXTR-NEXT: retl
+;
+; X64-FAST-BEXTR-LABEL: bextr32b_load:
+; X64-FAST-BEXTR: # %bb.0:
+; X64-FAST-BEXTR-NEXT: movl $3076, %eax # imm = 0xC04
+; X64-FAST-BEXTR-NEXT: bextrl %eax, (%rdi), %eax
+; X64-FAST-BEXTR-NEXT: retq
   %1 = load i32, i32* %x
   %2 = lshr i32 %1, 4
   %3 = and i32 %2, 4095
Index: test/CodeGen/X86/extract-bits.ll
===================================================================
--- test/CodeGen/X86/extract-bits.ll
+++ test/CodeGen/X86/extract-bits.ll
@@ -5653,8 +5653,9 @@
 ;
 ; X86-BMI1NOTBM-LABEL: c0_i32:
 ; X86-BMI1NOTBM: # %bb.0:
-; X86-BMI1NOTBM-NEXT: movl $2579, %eax # imm = 0xA13
-; X86-BMI1NOTBM-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT: shrl $19, %eax
+; X86-BMI1NOTBM-NEXT: andl $1023, %eax # imm = 0x3FF
 ; X86-BMI1NOTBM-NEXT: retl
 ;
 ; X86-BMI1TBM-LABEL: c0_i32:
@@ -5664,8 +5665,9 @@
 ;
 ; X86-BMI1NOTBMBMI2-LABEL: c0_i32:
 ; X86-BMI1NOTBMBMI2: # %bb.0:
-; X86-BMI1NOTBMBMI2-NEXT: movl $2579, %eax # imm = 0xA13
-; X86-BMI1NOTBMBMI2-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT: shrl $19, %eax
+; X86-BMI1NOTBMBMI2-NEXT: andl $1023, %eax # imm = 0x3FF
 ; X86-BMI1NOTBMBMI2-NEXT: retl
 ;
 ; X64-NOBMI-LABEL: c0_i32:
@@ -5677,8 +5679,9 @@
 ;
 ; X64-BMI1NOTBM-LABEL: c0_i32:
 ; X64-BMI1NOTBM: # %bb.0:
-; X64-BMI1NOTBM-NEXT: movl $2579, %eax # imm = 0xA13
-; X64-BMI1NOTBM-NEXT: bextrl %eax, %edi, %eax
+; X64-BMI1NOTBM-NEXT: movl %edi, %eax
+; X64-BMI1NOTBM-NEXT: shrl $19, %eax
+; X64-BMI1NOTBM-NEXT: andl $1023, %eax # imm = 0x3FF
 ; X64-BMI1NOTBM-NEXT: retq
 ;
 ; X64-BMI1TBM-LABEL: c0_i32:
@@ -5688,8 +5691,9 @@
 ;
 ; X64-BMI1NOTBMBMI2-LABEL: c0_i32:
 ; X64-BMI1NOTBMBMI2: # %bb.0:
-; X64-BMI1NOTBMBMI2-NEXT: movl $2579, %eax # imm = 0xA13
-; X64-BMI1NOTBMBMI2-NEXT: bextrl %eax, %edi, %eax
+; X64-BMI1NOTBMBMI2-NEXT: movl %edi, %eax
+; X64-BMI1NOTBMBMI2-NEXT: shrl $19, %eax
+; X64-BMI1NOTBMBMI2-NEXT: andl $1023, %eax # imm = 0x3FF
 ; X64-BMI1NOTBMBMI2-NEXT: retq
   %tmp0 = lshr i32 %arg, 19
   %tmp1 = and i32 %tmp0, 1023