Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -33592,6 +33592,35 @@
                                 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
   }
 
+  // Use a 32-bit and+zext if one input is an extend and the other already
+  // has zeros in the upper bits.
+  // TODO: Can we narrow the and even without the extend for an encoding
+  // size improvement? Unfortunately, doing this naively leads to infinite
+  // loops in DAG combine.
+  if (VT == MVT::i64 && Subtarget.is64Bit()) {
+    SDValue LHS = N->getOperand(0);
+    SDValue RHS = N->getOperand(1);
+    APInt HiMask = APInt::getHighBitsSet(64, 32);
+    if (LHS.getOpcode() == ISD::ANY_EXTEND &&
+        LHS.getOperand(0).getValueType() == MVT::i32 &&
+        DAG.MaskedValueIsZero(RHS, HiMask)) {
+      SDLoc dl(N);
+      RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, RHS);
+      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
+                         DAG.getNode(ISD::AND, dl, MVT::i32, LHS.getOperand(0),
+                                     RHS));
+    }
+    if (RHS.getOpcode() == ISD::ANY_EXTEND &&
+        RHS.getOperand(0).getValueType() == MVT::i32 &&
+        DAG.MaskedValueIsZero(LHS, HiMask)) {
+      SDLoc dl(N);
+      LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, LHS);
+      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
+                         DAG.getNode(ISD::AND, dl, MVT::i32, LHS,
+                                     RHS.getOperand(0)));
+    }
+  }
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
Index: test/CodeGen/X86/bmi.ll
===================================================================
--- test/CodeGen/X86/bmi.ll
+++ test/CodeGen/X86/bmi.ll
@@ -822,13 +822,13 @@
   ret i64 %r
 }
 
-; The add here gets shrunk, but the and does not thus hiding the blsr pattern.
+; The add here used to get shrunk, but the and did not, which hid the blsr pattern.
+; We now know the shift leaves the upper bits zero, so the and can be shrunk too, exposing blsr.
 define i64 @blsr_disguised_shrunk_add(i64 %x) {
 ; CHECK-LABEL: blsr_disguised_shrunk_add:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    shrq $48, %rdi
-; CHECK-NEXT:    leal -1(%rdi), %eax
-; CHECK-NEXT:    andq %rdi, %rax
+; CHECK-NEXT:    blsrl %edi, %eax
 ; CHECK-NEXT:    retq
   %a = lshr i64 %x, 48
   %b = add i64 %a, -1
Index: test/CodeGen/X86/var-permute-256.ll
===================================================================
--- test/CodeGen/X86/var-permute-256.ll
+++ test/CodeGen/X86/var-permute-256.ll
@@ -1575,19 +1575,19 @@
 ; XOP-NEXT:    vpextrd $2, %xmm1, %r9d
 ; XOP-NEXT:    vpextrd $3, %xmm1, %r10d
 ; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; XOP-NEXT:    vmovd %xmm1, %edx
-; XOP-NEXT:    vpextrd $1, %xmm1, %edi
-; XOP-NEXT:    vpextrd $2, %xmm1, %eax
-; XOP-NEXT:    vpextrd $3, %xmm1, %ecx
+; XOP-NEXT:    vmovd %xmm1, %edi
+; XOP-NEXT:    vpextrd $1, %xmm1, %eax
+; XOP-NEXT:    vpextrd $2, %xmm1, %ecx
+; XOP-NEXT:    vpextrd $3, %xmm1, %edx
 ; XOP-NEXT:    vmovaps %ymm0, (%rsp)
 ; XOP-NEXT:    andl $7, %esi
 ; XOP-NEXT:    andl $7, %r8d
 ; XOP-NEXT:    andl $7, %r9d
 ; XOP-NEXT:    andl $7, %r10d
-; XOP-NEXT:    andl $7, %edx
 ; XOP-NEXT:    andl $7, %edi
 ; XOP-NEXT:    andl $7, %eax
 ; XOP-NEXT:    andl $7, %ecx
+; XOP-NEXT:    andl $7, %edx
 ; XOP-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; XOP-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; XOP-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -1612,19 +1612,19 @@
 ; AVX1-NEXT:    vpextrd $2, %xmm1, %r9d
 ; AVX1-NEXT:    vpextrd $3, %xmm1, %r10d
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vmovd %xmm1, %edx
-; AVX1-NEXT:    vpextrd $1, %xmm1, %edi
-; AVX1-NEXT:    vpextrd $2, %xmm1, %eax
-; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
+; AVX1-NEXT:    vmovd %xmm1, %edi
+; AVX1-NEXT:    vpextrd $1, %xmm1, %eax
+; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT:    vpextrd $3, %xmm1, %edx
 ; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX1-NEXT:    andl $7, %esi
 ; AVX1-NEXT:    andl $7, %r8d
 ; AVX1-NEXT:    andl $7, %r9d
 ; AVX1-NEXT:    andl $7, %r10d
-; AVX1-NEXT:    andl $7, %edx
 ; AVX1-NEXT:    andl $7, %edi
 ; AVX1-NEXT:    andl $7, %eax
 ; AVX1-NEXT:    andl $7, %ecx
+; AVX1-NEXT:    andl $7, %edx
 ; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
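
For reference, a minimal IR sketch of the pattern the new combine targets, modeled on the blsr_disguised_shrunk_add test in bmi.ll above (the function name is hypothetical; this is not a test added by the patch). An earlier combine shrinks the 64-bit add to 32 bits, leaving an any_extend of a 32-bit add in the DAG, and the lshr by 48 guarantees the other and operand has zeros in its upper 32 bits, so the and can now be narrowed to a 32-bit and plus zero extend, which in turn exposes the blsrl pattern.

define i64 @narrow_and_sketch(i64 %x) {
  %a = lshr i64 %x, 48   ; upper 32 bits of %a are known zero
  %b = add i64 %a, -1    ; shrunk to a 32-bit add by an earlier combine
  %c = and i64 %b, %a    ; now narrowed to a 32-bit and + zext, matching blsrl
  ret i64 %c
}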