Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -33592,6 +33592,35 @@
                                 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
   }
 
+  // Use a 32-bit and+zext if one input is an extend and the other already
+  // has zeros in the upper bits.
+  // TODO: Can we narrow the and even without the extend for an encoding
+  // size improvement? Unfortunately, doing this naively leads to infinite
+  // loops in DAG combine.
+  if (VT == MVT::i64 && Subtarget.is64Bit()) {
+    SDValue LHS = N->getOperand(0);
+    SDValue RHS = N->getOperand(1);
+    APInt HiMask = APInt::getHighBitsSet(64, 32);
+    if (LHS.getOpcode() == ISD::ANY_EXTEND &&
+        LHS.getOperand(0).getValueType() == MVT::i32 &&
+        DAG.MaskedValueIsZero(RHS, HiMask)) {
+      SDLoc dl(N);
+      RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, RHS);
+      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
+                         DAG.getNode(ISD::AND, dl, MVT::i32, LHS.getOperand(0),
+                                     RHS));
+    }
+    if (RHS.getOpcode() == ISD::ANY_EXTEND &&
+        RHS.getOperand(0).getValueType() == MVT::i32 &&
+        DAG.MaskedValueIsZero(LHS, HiMask)) {
+      SDLoc dl(N);
+      LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, LHS);
+      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
+                         DAG.getNode(ISD::AND, dl, MVT::i32, LHS,
+                                     RHS.getOperand(0)));
+    }
+  }
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
Index: test/CodeGen/X86/bmi.ll
===================================================================
--- test/CodeGen/X86/bmi.ll
+++ test/CodeGen/X86/bmi.ll
@@ -822,13 +822,13 @@
   ret i64 %r
 }
 
-; The add here gets shrunk, but the and does not thus hiding the blsr pattern.
+; The add here used to get shrunk, but the and did not, which hid the blsr pattern.
+; We now know the shift leaves the upper bits zero, so the and can be shrunk too, exposing blsr.
 define i64 @blsr_disguised_shrunk_add(i64 %x) {
 ; CHECK-LABEL: blsr_disguised_shrunk_add:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    shrq $48, %rdi
-; CHECK-NEXT:    leal -1(%rdi), %eax
-; CHECK-NEXT:    andq %rdi, %rax
+; CHECK-NEXT:    blsrl %edi, %eax
 ; CHECK-NEXT:    retq
   %a = lshr i64 %x, 48
   %b = add i64 %a, -1
Index: test/CodeGen/X86/var-permute-256.ll
===================================================================
--- test/CodeGen/X86/var-permute-256.ll
+++ test/CodeGen/X86/var-permute-256.ll
@@ -1575,19 +1575,19 @@
 ; XOP-NEXT:    vpextrd $2, %xmm1, %r9d
 ; XOP-NEXT:    vpextrd $3, %xmm1, %r10d
 ; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; XOP-NEXT:    vmovd %xmm1, %edx
-; XOP-NEXT:    vpextrd $1, %xmm1, %edi
-; XOP-NEXT:    vpextrd $2, %xmm1, %eax
-; XOP-NEXT:    vpextrd $3, %xmm1, %ecx
+; XOP-NEXT:    vmovd %xmm1, %edi
+; XOP-NEXT:    vpextrd $1, %xmm1, %eax
+; XOP-NEXT:    vpextrd $2, %xmm1, %ecx
+; XOP-NEXT:    vpextrd $3, %xmm1, %edx
 ; XOP-NEXT:    vmovaps %ymm0, (%rsp)
 ; XOP-NEXT:    andl $7, %esi
 ; XOP-NEXT:    andl $7, %r8d
 ; XOP-NEXT:    andl $7, %r9d
 ; XOP-NEXT:    andl $7, %r10d
-; XOP-NEXT:    andl $7, %edx
 ; XOP-NEXT:    andl $7, %edi
 ; XOP-NEXT:    andl $7, %eax
 ; XOP-NEXT:    andl $7, %ecx
+; XOP-NEXT:    andl $7, %edx
 ; XOP-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; XOP-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; XOP-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -1612,19 +1612,19 @@
 ; AVX1-NEXT:    vpextrd $2, %xmm1, %r9d
 ; AVX1-NEXT:    vpextrd $3, %xmm1, %r10d
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vmovd %xmm1, %edx
-; AVX1-NEXT:    vpextrd $1, %xmm1, %edi
-; AVX1-NEXT:    vpextrd $2, %xmm1, %eax
-; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
+; AVX1-NEXT:    vmovd %xmm1, %edi
+; AVX1-NEXT:    vpextrd $1, %xmm1, %eax
+; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT:    vpextrd $3, %xmm1, %edx
 ; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX1-NEXT:    andl $7, %esi
 ; AVX1-NEXT:    andl $7, %r8d
 ; AVX1-NEXT:    andl $7, %r9d
 ; AVX1-NEXT:    andl $7, %r10d
-; AVX1-NEXT:    andl $7, %edx
 ; AVX1-NEXT:    andl $7, %edi
 ; AVX1-NEXT:    andl $7, %eax
 ; AVX1-NEXT:    andl $7, %ecx
+; AVX1-NEXT:    andl $7, %edx
 ; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
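
For reference, a minimal IR sketch of the pattern the new combine targets, modeled on the blsr_disguised_shrunk_add test in bmi.ll above (the function name is hypothetical; this is not a test added by the patch). An earlier combine shrinks the 64-bit add to 32 bits, leaving an any_extend of a 32-bit add in the DAG, and the lshr by 48 guarantees the other and operand has zeros in its upper 32 bits, so the and can now be narrowed to a 32-bit and plus zero extend, which in turn exposes the blsrl pattern.

define i64 @narrow_and_sketch(i64 %x) {
  %a = lshr i64 %x, 48   ; upper 32 bits of %a are known zero
  %b = add i64 %a, -1    ; shrunk to a 32-bit add by an earlier combine
  %c = and i64 %b, %a    ; now narrowed to a 32-bit and + zext, matching blsrl
  ret i64 %c
}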