diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -3441,10 +3441,16 @@ // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM // hoisting the move immediate would make it worthwhile with a less optimal // BEXTR? - if (!Subtarget->hasTBM() && - !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR())) + bool PreferBEXTR = + Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); + if (!PreferBEXTR && !Subtarget->hasBMI2()) return nullptr; + // BZHI, if available, is always fast, unlike BEXTR. But even if we decide + // that we can't use BEXTR, it isn't a sufficient reason to use BZHI. + // We still need a profitability check. + bool BZHIIsProfitable = false; + // Must have a shift right. if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) return nullptr; @@ -3481,23 +3487,50 @@ if (Shift + MaskSize > NVT.getSizeInBits()) return nullptr; - SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); - unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; - unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; + // If the mask is larger than 32-bit then it is warranted to use BZHI. + BZHIIsProfitable |= MaskSize > 32; - // BMI requires the immediate to placed in a register. - if (!Subtarget->hasTBM()) { - ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; - MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; + SDValue Control; + unsigned ROpc, MOpc; + + if (!PreferBEXTR) { + assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then."); + // If we can't make use of BEXTR then we can't fuse shift+mask stages. + // Let's perform the mask first, and apply shift later. Note that we need to + // widen the mask to account for the fact that we'll apply shift afterwards! 
+ Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT); + ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr; + MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm; unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; - New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0); + Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); + } else { + // The 'control' of BEXTR has the pattern of: + // [15...8 bit][ 7...0 bit] location + // [ bit count][ shift] name + // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 + Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); + if (Subtarget->hasTBM()) { + ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; + MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; + } else { + assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then."); + // BMI requires the immediate to be placed in a register. + ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; + MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; + unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; + Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); + } } MachineSDNode *NewNode; SDValue Input = N0->getOperand(0); SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { - SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) }; + // Load folding is a sufficient reason to use BZHI. + BZHIIsProfitable |= true; + + SDValue Ops[] = { + Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)}; SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. 
@@ -3505,7 +3538,19 @@ // Record the mem-refs CurDAG->setNodeMemRefs(NewNode, {cast(Input)->getMemOperand()}); } else { - NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New); + NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control); + } + + if (!PreferBEXTR) { + // So, is it worth it using BZHI for this pattern? + if (!BZHIIsProfitable) + return nullptr; + + // We still need to apply the shift. + SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT); + unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri; + NewNode = + CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt); } return NewNode; diff --git a/llvm/test/CodeGen/X86/bmi-x86_64.ll b/llvm/test/CodeGen/X86/bmi-x86_64.ll --- a/llvm/test/CodeGen/X86/bmi-x86_64.ll +++ b/llvm/test/CodeGen/X86/bmi-x86_64.ll @@ -46,12 +46,19 @@ } define i64 @bextr64b_load(i64* %x) { -; BEXTR-SLOW-LABEL: bextr64b_load: -; BEXTR-SLOW: # %bb.0: -; BEXTR-SLOW-NEXT: movl (%rdi), %eax -; BEXTR-SLOW-NEXT: shrl $4, %eax -; BEXTR-SLOW-NEXT: andl $4095, %eax # imm = 0xFFF -; BEXTR-SLOW-NEXT: retq +; BMI1-SLOW-LABEL: bextr64b_load: +; BMI1-SLOW: # %bb.0: +; BMI1-SLOW-NEXT: movl (%rdi), %eax +; BMI1-SLOW-NEXT: shrl $4, %eax +; BMI1-SLOW-NEXT: andl $4095, %eax # imm = 0xFFF +; BMI1-SLOW-NEXT: retq +; +; BMI2-SLOW-LABEL: bextr64b_load: +; BMI2-SLOW: # %bb.0: +; BMI2-SLOW-NEXT: movl $16, %eax +; BMI2-SLOW-NEXT: bzhil %eax, (%rdi), %eax +; BMI2-SLOW-NEXT: shrl $4, %eax +; BMI2-SLOW-NEXT: retq ; ; BEXTR-FAST-LABEL: bextr64b_load: ; BEXTR-FAST: # %bb.0: @@ -86,9 +93,9 @@ ; ; BMI2-SLOW-LABEL: bextr64d: ; BMI2-SLOW: # %bb.0: # %entry -; BMI2-SLOW-NEXT: shrq $2, %rdi -; BMI2-SLOW-NEXT: movb $33, %al +; BMI2-SLOW-NEXT: movl $35, %eax ; BMI2-SLOW-NEXT: bzhiq %rax, %rdi, %rax +; BMI2-SLOW-NEXT: shrq $2, %rax ; BMI2-SLOW-NEXT: retq ; ; BEXTR-FAST-LABEL: bextr64d: @@ -113,10 +120,9 @@ ; ; BMI2-SLOW-LABEL: bextr64d_load: ; BMI2-SLOW: # %bb.0: # %entry -; BMI2-SLOW-NEXT: movq (%rdi), %rax +; 
BMI2-SLOW-NEXT: movl $35, %eax +; BMI2-SLOW-NEXT: bzhiq %rax, (%rdi), %rax ; BMI2-SLOW-NEXT: shrq $2, %rax -; BMI2-SLOW-NEXT: movb $33, %cl -; BMI2-SLOW-NEXT: bzhiq %rcx, %rax, %rax ; BMI2-SLOW-NEXT: retq ; ; BEXTR-FAST-LABEL: bextr64d_load: diff --git a/llvm/test/CodeGen/X86/bmi.ll b/llvm/test/CodeGen/X86/bmi.ll --- a/llvm/test/CodeGen/X86/bmi.ll +++ b/llvm/test/CodeGen/X86/bmi.ll @@ -344,12 +344,12 @@ } define i32 @bextr32b(i32 %x) uwtable ssp { -; X86-SLOW-BEXTR-LABEL: bextr32b: -; X86-SLOW-BEXTR: # %bb.0: -; X86-SLOW-BEXTR-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-BEXTR-NEXT: shrl $4, %eax -; X86-SLOW-BEXTR-NEXT: andl $4095, %eax # imm = 0xFFF -; X86-SLOW-BEXTR-NEXT: retl +; X86-SLOW-BEXTR-BMI2-LABEL: bextr32b: +; X86-SLOW-BEXTR-BMI2: # %bb.0: +; X86-SLOW-BEXTR-BMI2-NEXT: movl $16, %eax +; X86-SLOW-BEXTR-BMI2-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax +; X86-SLOW-BEXTR-BMI2-NEXT: shrl $4, %eax +; X86-SLOW-BEXTR-BMI2-NEXT: retl ; ; X64-SLOW-BEXTR-LABEL: bextr32b: ; X64-SLOW-BEXTR: # %bb.0: @@ -392,20 +392,20 @@ } define i32 @bextr32b_load(i32* %x) uwtable ssp { -; X86-SLOW-BEXTR-LABEL: bextr32b_load: -; X86-SLOW-BEXTR: # %bb.0: -; X86-SLOW-BEXTR-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-BEXTR-NEXT: movl (%eax), %eax -; X86-SLOW-BEXTR-NEXT: shrl $4, %eax -; X86-SLOW-BEXTR-NEXT: andl $4095, %eax # imm = 0xFFF -; X86-SLOW-BEXTR-NEXT: retl +; X86-SLOW-BEXTR-BMI2-LABEL: bextr32b_load: +; X86-SLOW-BEXTR-BMI2: # %bb.0: +; X86-SLOW-BEXTR-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-BEXTR-BMI2-NEXT: movl $16, %ecx +; X86-SLOW-BEXTR-BMI2-NEXT: bzhil %ecx, (%eax), %eax +; X86-SLOW-BEXTR-BMI2-NEXT: shrl $4, %eax +; X86-SLOW-BEXTR-BMI2-NEXT: retl ; -; X64-SLOW-BEXTR-LABEL: bextr32b_load: -; X64-SLOW-BEXTR: # %bb.0: -; X64-SLOW-BEXTR-NEXT: movl (%rdi), %eax -; X64-SLOW-BEXTR-NEXT: shrl $4, %eax -; X64-SLOW-BEXTR-NEXT: andl $4095, %eax # imm = 0xFFF -; X64-SLOW-BEXTR-NEXT: retq +; X64-SLOW-BEXTR-BMI2-LABEL: bextr32b_load: +; X64-SLOW-BEXTR-BMI2: # %bb.0: +; 
X64-SLOW-BEXTR-BMI2-NEXT: movl $16, %eax +; X64-SLOW-BEXTR-BMI2-NEXT: bzhil %eax, (%rdi), %eax +; X64-SLOW-BEXTR-BMI2-NEXT: shrl $4, %eax +; X64-SLOW-BEXTR-BMI2-NEXT: retq ; ; X86-FAST-BEXTR-LABEL: bextr32b_load: ; X86-FAST-BEXTR: # %bb.0: