Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -422,6 +422,8 @@
     }
 
     bool foldLoadStoreIntoMemOperand(SDNode *Node);
+
+    bool matchBEXTRFromAnd(SDNode *Node);
   };
 }
 
@@ -2123,6 +2125,90 @@
   return true;
 }
 
+// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
+bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
+  MVT NVT = Node->getSimpleValueType(0);
+  SDLoc dl(Node);
+
+  SDValue N0 = Node->getOperand(0);
+  SDValue N1 = Node->getOperand(1);
+
+  if (!Subtarget->hasBMI() && !Subtarget->hasTBM())
+    return false;
+
+  // Must have a shift right.
+  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
+    return false;
+
+  // Shift can't have additional users.
+  if (!N0->hasOneUse())
+    return false;
+
+  // Only supported for 32 and 64 bits.
+  if (NVT != MVT::i32 && NVT != MVT::i64)
+    return false;
+
+  // Shift amount and RHS of and must be constant.
+  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
+  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+  if (!MaskCst || !ShiftCst)
+    return false;
+
+  // And RHS must be a mask.
+  uint64_t Mask = MaskCst->getZExtValue();
+  if (!isMask_64(Mask))
+    return false;
+
+  uint64_t Shift = ShiftCst->getZExtValue();
+  uint64_t MaskSize = countPopulation(Mask);
+
+  // Don't interfere with something that can be handled by extracting AH.
+  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
+  if (Shift == 8 && MaskSize == 8)
+    return false;
+
+  // Make sure we are only using bits that were in the original value, not
+  // shifted in.
+  if (Shift + MaskSize > NVT.getSizeInBits())
+    return false;
+
+  SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
+  unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
+  unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+
+  // BMI requires the immediate to be placed in a register.
+  if (!Subtarget->hasTBM()) {
+    ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
+    MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+    SDNode *Move = CurDAG->getMachineNode(X86::MOV32ri, dl, NVT, New);
+    New = SDValue(Move, 0);
+  }
+
+  MachineSDNode *NewNode;
+  SDValue Input = N0->getOperand(0);
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (tryFoldLoad(Node, Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
+    SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+    // Update the chain: users of the load's chain now use the new node's chain.
+    ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+    // Record the mem-refs.
+    LoadSDNode *LoadNode = cast<LoadSDNode>(Input);
+    if (LoadNode) {
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = LoadNode->getMemOperand();
+      NewNode->setMemRefs(MemOp, MemOp + 1);
+    }
+  } else {
+    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
+  }
+
+  ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+  CurDAG->RemoveDeadNode(Node);
+  return true;
+}
+
 void X86DAGToDAGISel::Select(SDNode *Node) {
   MVT NVT = Node->getSimpleValueType(0);
   unsigned Opc, MOpc;
@@ -2176,8 +2262,14 @@
   }
 
   case ISD::AND:
+    // Try to match BEXTR/BEXTRI instruction.
+    if (matchBEXTRFromAnd(Node))
+      return;
+
+    LLVM_FALLTHROUGH;
   case ISD::OR:
   case ISD::XOR: {
+
     // For operations of the form (x << C1) op C2, check if we can use a smaller
     // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
     SDValue N0 = Node->getOperand(0);
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -346,9 +346,6 @@
       ADD, SUB, ADC, SBB, SMUL,
       INC, DEC, OR, XOR, AND,
 
-      // Bit field extract.
-      BEXTR,
-
       // LOW, HI, FLAGS = umul LHS, RHS.
       UMUL,
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -24605,7 +24605,6 @@
   case X86ISD::OR:                 return "X86ISD::OR";
   case X86ISD::XOR:                return "X86ISD::XOR";
   case X86ISD::AND:                return "X86ISD::AND";
-  case X86ISD::BEXTR:              return "X86ISD::BEXTR";
   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
   case X86ISD::PTEST:              return "X86ISD::PTEST";
@@ -32196,9 +32195,6 @@
     return ShiftRight;
 
   EVT VT = N->getValueType(0);
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDLoc DL(N);
 
   // Attempt to recursively combine a bitmask AND with shuffles.
   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
@@ -32211,29 +32207,6 @@
     return SDValue(); // This routine will use CombineTo to replace N.
   }
 
-  // Create BEXTR instructions
-  // BEXTR is ((X >> imm) & (2**size-1))
-  if (VT != MVT::i32 && VT != MVT::i64)
-    return SDValue();
-
-  if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
-    return SDValue();
-  if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
-    return SDValue();
-
-  ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
-  ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
-  if (MaskNode && ShiftNode) {
-    uint64_t Mask = MaskNode->getZExtValue();
-    uint64_t Shift = ShiftNode->getZExtValue();
-    if (isMask_64(Mask)) {
-      uint64_t MaskSize = countPopulation(Mask);
-      if (Shift + MaskSize <= VT.getSizeInBits())
-        return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
-                           DAG.getConstant(Shift | (MaskSize << 8), DL,
-                                           VT));
-    }
-  }
   return SDValue();
 }
 
Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -271,8 +271,6 @@
                           [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
                            SDNPMemOperand]>;
 
-def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
-
 def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
 
 def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
@@ -2438,17 +2436,6 @@
             (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
 } // HasBMI2
 
-let Predicates = [HasBMI] in {
-  def : Pat<(X86bextr GR32:$src1, GR32:$src2),
-            (BEXTR32rr GR32:$src1, GR32:$src2)>;
-  def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2),
-            (BEXTR32rm addr:$src1, GR32:$src2)>;
-  def : Pat<(X86bextr GR64:$src1, GR64:$src2),
-            (BEXTR64rr GR64:$src1, GR64:$src2)>;
-  def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2),
-            (BEXTR64rm addr:$src1, GR64:$src2)>;
-} // HasBMI
-
 multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
                          X86MemOperand x86memop, Intrinsic Int,
                          PatFrag ld_frag> {
@@ -2650,15 +2637,6 @@
 //===----------------------------------------------------------------------===//
 
 let Predicates = [HasTBM] in {
-  def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
-            (BEXTRI32ri GR32:$src1, imm:$src2)>;
-  def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
-            (BEXTRI32mi addr:$src1, imm:$src2)>;
-  def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2),
-            (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>;
-  def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2),
-            (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>;
-
   // FIXME: patterns for the load versions are not implemented
   def : Pat<(and GR32:$src, (add GR32:$src, 1)),
             (BLCFILL32rr GR32:$src)>;
Index: test/CodeGen/X86/bmi.ll
===================================================================
--- test/CodeGen/X86/bmi.ll
+++ test/CodeGen/X86/bmi.ll
@@ -311,6 +311,18 @@
   ret i32 %2
 }
 
+; Make sure we still use AH subreg trick to extract 15:8
+define i32 @bextr32_subreg(i32 %x) uwtable ssp {
+; CHECK-LABEL: bextr32_subreg:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movzbl %ah, %eax # NOREX
+; CHECK-NEXT:    retq
+  %1 = lshr i32 %x, 8
+  %2 = and i32 %1, 255
+  ret i32 %2
+}
+
 define i32 @bextr32b_load(i32* %x) uwtable ssp {
 ; CHECK-LABEL: bextr32b_load:
 ; CHECK:       # BB#0:
@@ -357,6 +369,18 @@
   ret i64 %2
 }
 
+; Make sure we still use the AH subreg trick to extract 15:8
+define i64 @bextr64_subreg(i64 %x) uwtable ssp {
+; CHECK-LABEL: bextr64_subreg:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movzbl %ah, %eax # NOREX
+; CHECK-NEXT:    retq
+  %1 = lshr i64 %x, 8
+  %2 = and i64 %1, 255
+  ret i64 %2
+}
+
 define i64 @bextr64b_load(i64* %x) {
 ; CHECK-LABEL: bextr64b_load:
 ; CHECK:       # BB#0:
Index: test/CodeGen/X86/tbm_patterns.ll
===================================================================
--- test/CodeGen/X86/tbm_patterns.ll
+++ test/CodeGen/X86/tbm_patterns.ll
@@ -13,6 +13,18 @@
   ret i32 %t1
 }
 
+; Make sure we still use AH subreg trick for extracting bits 15:8
+define i32 @test_x86_tbm_bextri_u32_subreg(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u32_subreg:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movzbl %ah, %eax # NOREX
+; CHECK-NEXT:    retq
+  %t0 = lshr i32 %a, 8
+  %t1 = and i32 %t0, 255
+  ret i32 %t1
+}
+
 define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
 ; CHECK:       # BB#0:
@@ -40,7 +52,8 @@
 define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u32_z2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    bextr $3076, %edi, %eax # imm = 0xC04
+; CHECK-NEXT:    shrl $4, %edi
+; CHECK-NEXT:    testw $4095, %di # imm = 0xFFF
 ; CHECK-NEXT:    cmovnel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    retq
@@ -61,6 +74,18 @@
   ret i64 %t1
 }
 
+; Make sure we still use AH subreg trick for extracting bits 15:8
+define i64 @test_x86_tbm_bextri_u64_subreg(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u64_subreg:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movzbl %ah, %eax # NOREX
+; CHECK-NEXT:    retq
+  %t0 = lshr i64 %a, 8
+  %t1 = and i64 %t0, 255
+  ret i64 %t1
+}
+
 define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
 ; CHECK:       # BB#0:
@@ -88,7 +113,8 @@
 define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u64_z2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    bextr $3076, %edi, %eax # imm = 0xC04
+; CHECK-NEXT:    shrl $4, %edi
+; CHECK-NEXT:    testw $4095, %di # imm = 0xFFF
 ; CHECK-NEXT:    cmovneq %rdx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    retq
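
For reviewers who want to sanity-check the arithmetic outside of LLVM: the sketch below is not part of the patch. It is a minimal standalone C++ illustration of the transform matchBEXTRFromAnd performs, rewriting (x >> Shift) & Mask into a bit-field extract whose control operand packs the start bit and field length as Shift | (MaskSize << 8) (so the 0xC04 immediate in the old z2 checks meant start 4, length 12). The helper names (referenceBextr, bextrControlFromShiftAndMask) are illustrative only, not LLVM APIs, and the 15:8 byte extract is rejected exactly as in the patch so the movzbl-from-AH form keeps winning.

// Standalone illustration only; mirrors the checks in matchBEXTRFromAnd.
#include <cassert>
#include <cstdint>

// Reference semantics of BEXTR: extract `len` bits of `x` starting at bit
// `start`, where the control value packs the fields as start | (len << 8).
static uint64_t referenceBextr(uint64_t x, uint64_t control) {
  uint64_t start = control & 0xff;
  uint64_t len = (control >> 8) & 0xff;
  if (start >= 64 || len == 0)
    return 0;
  x >>= start;
  if (len >= 64)
    return x;
  return x & ((1ULL << len) - 1);
}

// Decide whether (x >> shift) & mask on a `bits`-wide value can become a
// single bit-field extract, and if so produce the control immediate. The
// rules follow the patch: mask must be of the form 2^n-1, the extracted
// range must not include shifted-in bits, and the byte at 15:8 is left to
// the cheaper movzbl-from-AH sequence.
static bool bextrControlFromShiftAndMask(uint64_t shift, uint64_t mask,
                                         unsigned bits, uint64_t &control) {
  if (mask == 0 || (mask & (mask + 1)) != 0) // not a low-bit mask
    return false;
  uint64_t maskSize = __builtin_popcountll(mask); // GCC/Clang builtin
  if (shift == 8 && maskSize == 8)                // AH subreg trick is smaller
    return false;
  if (shift + maskSize > bits)                    // would read shifted-in bits
    return false;
  control = shift | (maskSize << 8);
  return true;
}

int main() {
  uint64_t control = 0;
  // (x >> 4) & 0xfff on a 32-bit value -> control 0xC04, the immediate the
  // old z2 tests checked for.
  bool ok = bextrControlFromShiftAndMask(4, 0xfff, 32, control);
  assert(ok && control == 0xc04);
  assert(referenceBextr(0x12345678, control) == ((0x12345678ull >> 4) & 0xfff));
  // (x >> 8) & 0xff is deliberately rejected; extracting AH is preferred.
  assert(!bextrControlFromShiftAndMask(8, 0xff, 32, control));
  (void)ok;
  return 0;
}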