Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -33066,6 +33066,124 @@
   return DAG.getBitcast(N->getValueType(0), Shift);
 }
 
+// Get the index node from the lowered DAG of a GEP IR instruction with one
+// indexing dimension.
+static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
+  if (Ld->isIndexed())
+    return SDValue();
+
+  SDValue Base = Ld->getBasePtr();
+
+  if (Base.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  SDValue ShiftedIndex = Base.getOperand(0);
+
+  if (ShiftedIndex.getOpcode() != ISD::SHL)
+    return SDValue();
+
+  return ShiftedIndex.getOperand(0);
+}
+
+static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
+  if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
+    switch (VT.getSizeInBits()) {
+    default: return false;
+    case 64: return Subtarget.is64Bit();
+    case 32: return true;
+    }
+  }
+  return false;
+}
+
+// This function recognizes cases where an X86 bzhi instruction can replace an
+// 'and-load' sequence.
+// When an integer value is loaded from an array of constants of the form
+//
+//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
+//
+// and the loaded value is then bitwise-ANDed with another input, the sequence
+// is equivalent to performing bzhi (zero high bits) on the input, using the
+// load's index as the bit count.
+static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget) {
+  MVT VT = Node->getSimpleValueType(0);
+  SDLoc dl(Node);
+
+  // Check if the subtarget has a BZHI instruction for the node's type.
+  if (!hasBZHI(Subtarget, VT))
+    return SDValue();
+
+  // Try matching the pattern for both operands.
+  for (unsigned i = 0; i < 2; i++) {
+    SDValue N = Node->getOperand(i);
+    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
+
+    // Bail out if the operand is not a load instruction.
+    if (!Ld)
+      return SDValue();
+
+    const Value *MemOp = Ld->getMemOperand()->getValue();
+
+    if (!MemOp)
+      return SDValue();
+
+    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
+      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
+        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
+
+          Constant *Init = GV->getInitializer();
+          Type *Ty = Init->getType();
+          if (!isa<ConstantDataArray>(Init) ||
+              !Ty->getArrayElementType()->isIntegerTy() ||
+              Ty->getArrayElementType()->getScalarSizeInBits() !=
+                  VT.getSizeInBits() ||
+              Ty->getArrayNumElements() >
+                  Ty->getArrayElementType()->getScalarSizeInBits())
+            continue;
+
+          // Check if the array's constant elements are suitable to our case.
+          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
+          bool ConstantsMatch = true;
+          for (uint64_t j = 0; j < ArrayElementCount; j++) {
+            ConstantInt *Elem =
+                cast<ConstantInt>(Init->getAggregateElement(j));
+            if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
+              ConstantsMatch = false;
+              break;
+            }
+          }
+          if (!ConstantsMatch)
+            continue;
+
+          // Do the transformation (for the 32-bit type):
+          // -> (and (load arr[idx]), inp)
+          // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
+          //    which will then be selected as a single bzhi instruction.
+          SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
+          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
+
+          // Get the Node which indexes into the array.
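+          // getIndexFromUnindexedLoad() expects the address to be the lowered
+          // form of a one-dimensional GEP, (add (shl idx, log2(EltSize)),
+          // TableBase), and peels it back to return the 'idx' operand.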
+          SDValue Index = getIndexFromUnindexedLoad(Ld);
+          if (!Index)
+            return SDValue();
+          Index = DAG.getZExtOrTrunc(Index, dl, VT);
+
+          SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
+
+          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
+          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
+
+          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -33094,6 +33212,9 @@
   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
     return ShiftRight;
 
+  if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
+    return R;
+
   // Attempt to recursively combine a bitmask AND with shuffles.
   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
     SDValue Op(N, 0);
Index: llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll
+++ llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll
@@ -10,17 +10,14 @@
 define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr {
 ; CHECK-LABEL: f32_bzhi:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movslq %esi, %rax
-; CHECK-NEXT: andl fill_table32(,%rax,4), %edi
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: ret{{[l|q]}}
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
 ;
 ; CHECK32-LABEL: f32_bzhi:
 ; CHECK32: # %bb.0: # %entry
 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: movl fill_table32(,%eax,4), %eax
-; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: ret{{[l|q]}}
+; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: retl
 entry:
   %idxprom = sext i32 %y to i64
   %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @fill_table32, i64 0, i64 %idxprom
@@ -32,17 +29,14 @@
 define i32 @f32_bzhi_partial(i32 %x, i32 %y) local_unnamed_addr {
 ; CHECK-LABEL: f32_bzhi_partial:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movslq %esi, %rax
-; CHECK-NEXT: andl fill_table32_partial(,%rax,4), %edi
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: ret{{[l|q]}}
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
 ;
 ; CHECK32-LABEL: f32_bzhi_partial:
 ; CHECK32: # %bb.0: # %entry
 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: movl fill_table32_partial(,%eax,4), %eax
-; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: ret{{[l|q]}}
+; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: retl
 entry:
   %idxprom = sext i32 %y to i64
   %arrayidx = getelementptr inbounds [17 x i32], [17 x i32]* @fill_table32_partial, i64 0, i64 %idxprom
@@ -54,9 +48,8 @@
 define i64 @f64_bzhi(i64 %x, i64 %y) local_unnamed_addr {
 ; CHECK-LABEL: f64_bzhi:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andq fill_table64(,%rsi,8), %rdi
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: ret{{[l|q]}}
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
 ;
 ; CHECK32-LABEL: f64_bzhi:
 ; CHECK32: # %bb.0: # %entry
@@ -65,7 +58,7 @@
 ; CHECK32-NEXT: movl fill_table64(,%eax,8), %eax
 ; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx
-; CHECK32-NEXT: ret{{[l|q]}}
+; CHECK32-NEXT: retl
 entry:
   %arrayidx = getelementptr inbounds [64 x i64], [64 x i64]* @fill_table64, i64 0, i64 %y
   %0 = load i64, i64* %arrayidx, align 8
@@ -76,9 +69,8 @@
 define i64 @f64_bzhi_partial(i64 %x, i64 %y) local_unnamed_addr {
 ; CHECK-LABEL: f64_bzhi_partial:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andq fill_table64_partial(,%rsi,8), %rdi
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: ret{{[l|q]}}
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
 ;
 ; CHECK32-LABEL: f64_bzhi_partial:
 ; CHECK32: # %bb.0: # %entry
@@ -87,7 +79,7 @@
 ; CHECK32-NEXT: movl fill_table64_partial(,%eax,8), %eax
 ; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx
-; CHECK32-NEXT: ret{{[l|q]}}
+; CHECK32-NEXT: retl
 entry:
   %arrayidx = getelementptr inbounds [51 x i64], [51 x i64]* @fill_table64_partial, i64 0, i64 %y
   %0 = load i64, i64* %arrayidx, align 8
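
Note: the C-level idiom this combine targets, and the mask computation it
rewrites it to, can be sketched as follows. This is a minimal illustration and
not part of the patch: the table contents match @fill_table32 from the test
above, but the function names mask_via_table and mask_via_shift are invented
for the sketch, and the i == 0 case is excluded from the shift form because a
C right-shift by 32 is undefined (the bzhi instruction, like the table load,
yields 0 there).

    #include <stdint.h>

    /* fill_table32[i] == (1u << i) - 1: a mask of the i lowest bits. */
    static const uint32_t fill_table32[32] = {
        0x00000000, 0x00000001, 0x00000003, 0x00000007,
        0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f,
        0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff,
        0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff,
        0x0000ffff, 0x0001ffff, 0x0003ffff, 0x0007ffff,
        0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff,
        0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff,
        0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff};

    /* The and-load pattern the combine recognizes, for 0 <= i <= 31. */
    uint32_t mask_via_table(uint32_t x, unsigned i) {
      return x & fill_table32[i];
    }

    /* The equivalent the combine builds: (and x, (srl ~0u, (32 - i))).
     * Valid in C for 1 <= i <= 31; compiled with -mbmi2, the backend can
     * select this as a single bzhi instruction. */
    uint32_t mask_via_shift(uint32_t x, unsigned i) {
      return x & (0xffffffffu >> (32 - i));
    }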