Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -422,6 +422,8 @@
     }
 
     bool foldLoadStoreIntoMemOperand(SDNode *Node);
+
+    bool matchBEXTRFromAnd(SDNode *Node);
   };
 }
 
@@ -2123,6 +2125,90 @@
   return true;
 }
 
+// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
+bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
+  MVT NVT = Node->getSimpleValueType(0);
+  SDLoc dl(Node);
+
+  SDValue N0 = Node->getOperand(0);
+  SDValue N1 = Node->getOperand(1);
+
+  if (!Subtarget->hasBMI() && !Subtarget->hasTBM())
+    return false;
+
+  // Must have a shift right.
+  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
+    return false;
+
+  // Shift can't have additional users.
+  if (!N0->hasOneUse())
+    return false;
+
+  // Only supported for 32 and 64 bits.
+  if (NVT != MVT::i32 && NVT != MVT::i64)
+    return false;
+
+  // Shift amount and RHS of and must be constant.
+  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
+  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+  if (!MaskCst || !ShiftCst)
+    return false;
+
+  // And RHS must be a mask.
+  uint64_t Mask = MaskCst->getZExtValue();
+  if (!isMask_64(Mask))
+    return false;
+
+  uint64_t Shift = ShiftCst->getZExtValue();
+  uint64_t MaskSize = countPopulation(Mask);
+
+  // Don't interfere with something that can be handled by extracting AH.
+  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
+  if (Shift == 8 && MaskSize == 8)
+    return false;
+
+  // Make sure we are only using bits that were in the original value, not
+  // shifted in.
+  if (Shift + MaskSize > NVT.getSizeInBits())
+    return false;
+
+  SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
+  unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
+  unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+
+  // BMI requires the immediate to be placed in a register.
+  if (!Subtarget->hasTBM()) {
+    ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
+    MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+    SDNode *Move = CurDAG->getMachineNode(X86::MOV32ri, dl, NVT, New);
+    New = SDValue(Move, 0);
+  }
+
+  MachineSDNode *NewNode;
+  SDValue Input = N0->getOperand(0);
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (tryFoldLoad(Node, Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
+    SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+    // Update the chain: users of the load's chain now use the new node's chain.
+    ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+    // Record the mem-refs.
+    LoadSDNode *LoadNode = cast<LoadSDNode>(Input);
+    if (LoadNode) {
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = LoadNode->getMemOperand();
+      NewNode->setMemRefs(MemOp, MemOp + 1);
+    }
+  } else {
+    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
+  }
+
+  ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+  CurDAG->RemoveDeadNode(Node);
+  return true;
+}
+
 void X86DAGToDAGISel::Select(SDNode *Node) {
   MVT NVT = Node->getSimpleValueType(0);
   unsigned Opc, MOpc;
@@ -2176,8 +2262,14 @@
   }
 
   case ISD::AND:
+    // Try to match BEXTR/BEXTRI instruction.
+    if (matchBEXTRFromAnd(Node))
+      return;
+
+    LLVM_FALLTHROUGH;
   case ISD::OR:
   case ISD::XOR: {
+
     // For operations of the form (x << C1) op C2, check if we can use a smaller
     // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
     SDValue N0 = Node->getOperand(0);
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -346,9 +346,6 @@
       ADD, SUB, ADC, SBB, SMUL,
       INC, DEC, OR, XOR, AND,
 
-      // Bit field extract.
-      BEXTR,
-
       // LOW, HI, FLAGS = umul LHS, RHS.
       UMUL,
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -24605,7 +24605,6 @@
   case X86ISD::OR:                 return "X86ISD::OR";
   case X86ISD::XOR:                return "X86ISD::XOR";
   case X86ISD::AND:                return "X86ISD::AND";
-  case X86ISD::BEXTR:              return "X86ISD::BEXTR";
   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
   case X86ISD::PTEST:              return "X86ISD::PTEST";
@@ -32196,9 +32195,6 @@
     return ShiftRight;
 
   EVT VT = N->getValueType(0);
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDLoc DL(N);
 
   // Attempt to recursively combine a bitmask AND with shuffles.
   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
@@ -32211,29 +32207,6 @@
     return SDValue(); // This routine will use CombineTo to replace N.
   }
 
-  // Create BEXTR instructions
-  // BEXTR is ((X >> imm) & (2**size-1))
-  if (VT != MVT::i32 && VT != MVT::i64)
-    return SDValue();
-
-  if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
-    return SDValue();
-  if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
-    return SDValue();
-
-  ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
-  ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
-  if (MaskNode && ShiftNode) {
-    uint64_t Mask = MaskNode->getZExtValue();
-    uint64_t Shift = ShiftNode->getZExtValue();
-    if (isMask_64(Mask)) {
-      uint64_t MaskSize = countPopulation(Mask);
-      if (Shift + MaskSize <= VT.getSizeInBits())
-        return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
-                           DAG.getConstant(Shift | (MaskSize << 8), DL,
-                                           VT));
-    }
-  }
   return SDValue();
 }
 
Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -271,8 +271,6 @@
                           [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
                            SDNPMemOperand]>;
 
-def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
-
 def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
 
 def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
@@ -2438,17 +2436,6 @@
             (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
 } // HasBMI2
 
-let Predicates = [HasBMI] in {
-  def : Pat<(X86bextr GR32:$src1, GR32:$src2),
-            (BEXTR32rr GR32:$src1, GR32:$src2)>;
-  def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2),
-            (BEXTR32rm addr:$src1, GR32:$src2)>;
-  def : Pat<(X86bextr GR64:$src1, GR64:$src2),
-            (BEXTR64rr GR64:$src1, GR64:$src2)>;
-  def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2),
-            (BEXTR64rm addr:$src1, GR64:$src2)>;
-} // HasBMI
-
 multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
                          X86MemOperand x86memop, Intrinsic Int,
                          PatFrag ld_frag> {
@@ -2650,15 +2637,6 @@
 //===----------------------------------------------------------------------===//
 
 let Predicates = [HasTBM] in {
-  def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
-            (BEXTRI32ri GR32:$src1, imm:$src2)>;
-  def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
-            (BEXTRI32mi addr:$src1, imm:$src2)>;
-  def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2),
-            (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>;
-  def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2),
-            (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>;
-
   // FIXME: patterns for the load versions are not implemented
   def : Pat<(and GR32:$src, (add GR32:$src, 1)),
             (BLCFILL32rr GR32:$src)>;
Index: test/CodeGen/X86/bmi.ll
===================================================================
--- test/CodeGen/X86/bmi.ll
+++ test/CodeGen/X86/bmi.ll
@@ -311,6 +311,18 @@
   ret i32 %2
 }
 
+; Make sure we still use AH subreg trick to extract 15:8
+define i32 @bextr32_subreg(i32 %x) uwtable ssp {
+; CHECK-LABEL: bextr32_subreg:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movzbl %ah, %eax # NOREX
+; CHECK-NEXT:    retq
+  %1 = lshr i32 %x, 8
+  %2 = and i32 %1, 255
+  ret i32 %2
+}
+
 define i32 @bextr32b_load(i32* %x) uwtable ssp {
 ; CHECK-LABEL: bextr32b_load:
 ; CHECK:       # BB#0:
@@ -357,6 +369,18 @@
   ret i64 %2
 }
 
+; Make sure we still use the AH subreg trick to extract 15:8
+define i64 @bextr64_subreg(i64 %x) uwtable ssp {
+; CHECK-LABEL: bextr64_subreg:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movzbl %ah, %eax # NOREX
+; CHECK-NEXT:    retq
+  %1 = lshr i64 %x, 8
+  %2 = and i64 %1, 255
+  ret i64 %2
+}
+
 define i64 @bextr64b_load(i64* %x) {
 ; CHECK-LABEL: bextr64b_load:
 ; CHECK:       # BB#0:
Index: test/CodeGen/X86/tbm_patterns.ll
===================================================================
--- test/CodeGen/X86/tbm_patterns.ll
+++ test/CodeGen/X86/tbm_patterns.ll
@@ -13,6 +13,18 @@
   ret i32 %t1
 }
 
+; Make sure we still use AH subreg trick for extracting bits 15:8
+define i32 @test_x86_tbm_bextri_u32_subreg(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u32_subreg:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movzbl %ah, %eax # NOREX
+; CHECK-NEXT:    retq
+  %t0 = lshr i32 %a, 8
+  %t1 = and i32 %t0, 255
+  ret i32 %t1
+}
+
 define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
 ; CHECK:       # BB#0:
@@ -40,7 +52,8 @@
 define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u32_z2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    bextr $3076, %edi, %eax # imm = 0xC04
+; CHECK-NEXT:    shrl $4, %edi
+; CHECK-NEXT:    testw $4095, %di # imm = 0xFFF
 ; CHECK-NEXT:    cmovnel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    retq
@@ -61,6 +74,18 @@
   ret i64 %t1
 }
 
+; Make sure we still use AH subreg trick for extracting bits 15:8
+define i64 @test_x86_tbm_bextri_u64_subreg(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u64_subreg:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movzbl %ah, %eax # NOREX
+; CHECK-NEXT:    retq
+  %t0 = lshr i64 %a, 8
+  %t1 = and i64 %t0, 255
+  ret i64 %t1
+}
+
 define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
 ; CHECK:       # BB#0:
@@ -88,7 +113,8 @@
 define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u64_z2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    bextr $3076, %edi, %eax # imm = 0xC04
+; CHECK-NEXT:    shrl $4, %edi
+; CHECK-NEXT:    testw $4095, %di # imm = 0xFFF
 ; CHECK-NEXT:    cmovneq %rdx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    retq
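
For reviewers who want to sanity-check the arithmetic outside of LLVM: the sketch below is not part of the patch. It is a minimal standalone C++ illustration of the transform matchBEXTRFromAnd performs, rewriting (x >> Shift) & Mask into a bit-field extract whose control operand packs the start bit and field length as Shift | (MaskSize << 8) (so the 0xC04 immediate in the old z2 checks meant start 4, length 12). The helper names (referenceBextr, bextrControlFromShiftAndMask) are illustrative only, not LLVM APIs, and the 15:8 byte extract is rejected exactly as in the patch so the movzbl-from-AH form keeps winning.

// Standalone illustration only; mirrors the checks in matchBEXTRFromAnd.
#include <cassert>
#include <cstdint>

// Reference semantics of BEXTR: extract `len` bits of `x` starting at bit
// `start`, where the control value packs the fields as start | (len << 8).
static uint64_t referenceBextr(uint64_t x, uint64_t control) {
  uint64_t start = control & 0xff;
  uint64_t len = (control >> 8) & 0xff;
  if (start >= 64 || len == 0)
    return 0;
  x >>= start;
  if (len >= 64)
    return x;
  return x & ((1ULL << len) - 1);
}

// Decide whether (x >> shift) & mask on a `bits`-wide value can become a
// single bit-field extract, and if so produce the control immediate. The
// rules follow the patch: mask must be of the form 2^n-1, the extracted
// range must not include shifted-in bits, and the byte at 15:8 is left to
// the cheaper movzbl-from-AH sequence.
static bool bextrControlFromShiftAndMask(uint64_t shift, uint64_t mask,
                                         unsigned bits, uint64_t &control) {
  if (mask == 0 || (mask & (mask + 1)) != 0) // not a low-bit mask
    return false;
  uint64_t maskSize = __builtin_popcountll(mask); // GCC/Clang builtin
  if (shift == 8 && maskSize == 8)                // AH subreg trick is smaller
    return false;
  if (shift + maskSize > bits)                    // would read shifted-in bits
    return false;
  control = shift | (maskSize << 8);
  return true;
}

int main() {
  uint64_t control = 0;
  // (x >> 4) & 0xfff on a 32-bit value -> control 0xC04, the immediate the
  // old z2 tests checked for.
  bool ok = bextrControlFromShiftAndMask(4, 0xfff, 32, control);
  assert(ok && control == 0xc04);
  assert(referenceBextr(0x12345678, control) == ((0x12345678ull >> 4) & 0xfff));
  // (x >> 8) & 0xff is deliberately rejected; extracting AH is preferred.
  assert(!bextrControlFromShiftAndMask(8, 0xff, 32, control));
  (void)ok;
  return 0;
}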