Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -31762,6 +31762,40 @@
   return SDValue();
 }
 
+static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+
+  // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
+  // TODO: This is a generic DAG combine that became an x86-only combine to
+  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
+  // and-not ('andn').
+  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+    return SDValue();
+
+  auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
+  auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  if (!ShiftC || !AndC)
+    return SDValue();
+
+  // If the 'and' mask is already smaller than a byte, then don't bother.
+  // If the new 'and' mask would be bigger than a byte, then don't bother.
+  // If the mask fits in a byte, then we know we can generate smaller and
+  // potentially better code by shifting first.
+  // TODO: Always try to shrink a mask that is over 32-bits?
+  APInt MaskVal = AndC->getAPIntValue();
+  APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
+  if (MaskVal.getMinSignedBits() <= 8 || NewMaskVal.getMinSignedBits() > 8)
+    return SDValue();
+
+  // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
+  SDLoc DL(N);
+  SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
+  SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
+  return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
+}
+
 /// \brief Returns a vector of 0s if the node in input is a vector logical
 /// shift by a constant amount which is known to be bigger than or equal
 /// to the vector element size in bits.
@@ -31804,6 +31838,10 @@
   if (SDValue V = combineShiftRightAlgebraic(N, DAG))
     return V;
 
+  if (N->getOpcode() == ISD::SRL)
+    if (SDValue V = combineShiftRightLogical(N, DAG))
+      return V;
+
   // Try to fold this logical shift into a zero vector.
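// --- Illustrative sketch (not part of the patch) ---
// A minimal standalone C++ example, under the assumption that ordinary integer
// arithmetic mirrors the DAG nodes involved: the combine rewrites
//   srl (and X, AndC), ShiftC  -->  and (srl X, ShiftC), (AndC >> ShiftC)
// which is profitable when the shifted-down mask fits in a byte. The constants
// below mirror the test change where 'andl $61440; shrl $12' becomes
// 'shrl $12; andl $15'. The function names are hypothetical.
#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint32_t maskThenShift(uint32_t X) { return (X & 0xF000u) >> 12; }
// Shifting first turns the 0xF000 mask into 0xF, which fits in a byte.
static uint32_t shiftThenMask(uint32_t X) { return (X >> 12) & (0xF000u >> 12); }

int main() {
  for (uint32_t X : {0u, 0x1234u, 0xF000u, 0xFFFFFFFFu})
    assert(maskThenShift(X) == shiftThenMask(X)); // both orderings agree for all X
  return 0;
}
// --- End of illustrative sketch ---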
if (N->getOpcode() != ISD::SRA) if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -129,8 +129,8 @@ ; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] ; X32-NEXT: vpmovb2m %zmm1, %k0 ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $61440, %ecx # imm = 0xF000 ; X32-NEXT: shrl $12, %ecx +; X32-NEXT: andl $15, %ecx ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: vpmovm2b %k1, %zmm1 ; X32-NEXT: vpbroadcastd %xmm1, %xmm1 @@ -151,8 +151,8 @@ ; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] ; X32-NEXT: vpmovb2m %zmm1, %k0 ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $49152, %ecx # imm = 0xC000 ; X32-NEXT: shrl $14, %ecx +; X32-NEXT: andl $3, %ecx ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: vpmovm2b %k1, %zmm1 ; X32-NEXT: vpbroadcastw %xmm1, %xmm1 @@ -162,8 +162,8 @@ ; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] ; X32-NEXT: vpmovb2m %zmm1, %k0 ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $32768, %ecx # imm = 0x8000 ; X32-NEXT: shrl $15, %ecx +; X32-NEXT: andl $1, %ecx ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: vpmovm2b %k1, %zmm1 ; X32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] @@ -483,8 +483,8 @@ ; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X32-NEXT: vpmovb2m %zmm0, %k0 ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $61440, %ecx # imm = 0xF000 ; X32-NEXT: shrl $12, %ecx +; X32-NEXT: andl $15, %ecx ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: vpmovm2b %k1, %zmm0 ; X32-NEXT: vpbroadcastd %xmm0, %xmm0 @@ -507,8 +507,8 @@ ; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X32-NEXT: vpmovb2m %zmm0, %k0 ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $49152, %ecx # imm = 0xC000 ; X32-NEXT: shrl $14, %ecx +; X32-NEXT: andl $3, %ecx ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: vpmovm2b %k1, %zmm0 ; X32-NEXT: vpbroadcastw %xmm0, %xmm0 @@ -519,8 +519,8 @@ ; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X32-NEXT: vpmovb2m %zmm0, %k0 ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $32768, %ecx # imm = 0x8000 ; X32-NEXT: shrl $15, %ecx +; X32-NEXT: andl $1, %ecx ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: vpmovm2b %k1, %zmm0 ; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] @@ -860,8 +860,8 @@ ; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; X32-NEXT: vpmovb2m %zmm0, %k0 ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $61440, %ecx # imm = 0xF000 ; X32-NEXT: shrl $12, %ecx +; X32-NEXT: andl $15, %ecx ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: vpmovm2b %k1, %zmm0 ; X32-NEXT: vpbroadcastd %xmm0, %xmm0 @@ -882,8 +882,8 @@ ; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; X32-NEXT: vpmovb2m %zmm0, %k0 ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $49152, %ecx # imm = 0xC000 ; X32-NEXT: shrl $14, %ecx +; X32-NEXT: andl $3, %ecx ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: vpmovm2b %k1, %zmm0 ; X32-NEXT: vpbroadcastw %xmm0, %xmm0 @@ -893,8 +893,8 @@ ; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; X32-NEXT: vpmovb2m %zmm0, %k0 ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $32768, %ecx # imm = 0x8000 ; X32-NEXT: shrl $15, %ecx +; X32-NEXT: andl $1, %ecx ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: vpmovm2b %k1, %zmm0 ; X32-NEXT: vpslldq 
{{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] @@ -1214,8 +1214,8 @@ ; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X32-NEXT: vpmovb2m %zmm0, %k0 ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $61440, %ecx # imm = 0xF000 ; X32-NEXT: shrl $12, %ecx +; X32-NEXT: andl $15, %ecx ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: vpmovm2b %k1, %zmm0 ; X32-NEXT: vpbroadcastd %xmm0, %xmm0 @@ -1238,8 +1238,8 @@ ; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X32-NEXT: vpmovb2m %zmm0, %k0 ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $49152, %ecx # imm = 0xC000 ; X32-NEXT: shrl $14, %ecx +; X32-NEXT: andl $3, %ecx ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: vpmovm2b %k1, %zmm0 ; X32-NEXT: vpbroadcastw %xmm0, %xmm0 @@ -1250,8 +1250,8 @@ ; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X32-NEXT: vpmovb2m %zmm0, %k0 ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: andl $32768, %ecx # imm = 0x8000 ; X32-NEXT: shrl $15, %ecx +; X32-NEXT: andl $1, %ecx ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: vpmovm2b %k1, %zmm0 ; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1734,19 +1734,29 @@ ; ; AVX512F-32-LABEL: test_mask_cmp_b_512: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: pushl %ebx +; AVX512F-32-NEXT: pushl %ebp ; AVX512F-32-NEXT: .Lcfi5: ; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 -; AVX512F-32-NEXT: pushl %esi +; AVX512F-32-NEXT: pushl %ebx ; AVX512F-32-NEXT: .Lcfi6: ; AVX512F-32-NEXT: .cfi_def_cfa_offset 12 -; AVX512F-32-NEXT: subl $60, %esp +; AVX512F-32-NEXT: pushl %edi ; AVX512F-32-NEXT: .Lcfi7: -; AVX512F-32-NEXT: .cfi_def_cfa_offset 72 +; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: .Lcfi8: -; AVX512F-32-NEXT: .cfi_offset %esi, -12 +; AVX512F-32-NEXT: .cfi_def_cfa_offset 20 +; AVX512F-32-NEXT: subl $60, %esp ; AVX512F-32-NEXT: .Lcfi9: -; AVX512F-32-NEXT: .cfi_offset %ebx, -8 +; AVX512F-32-NEXT: .cfi_def_cfa_offset 80 +; AVX512F-32-NEXT: .Lcfi10: +; AVX512F-32-NEXT: .cfi_offset %esi, -20 +; AVX512F-32-NEXT: .Lcfi11: +; AVX512F-32-NEXT: .cfi_offset %edi, -16 +; AVX512F-32-NEXT: .Lcfi12: +; AVX512F-32-NEXT: .cfi_offset %ebx, -12 +; AVX512F-32-NEXT: .Lcfi13: +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $5, %al @@ -1862,9 +1872,8 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000 -; AVX512F-32-NEXT: shrl $12, %eax +; AVX512F-32-NEXT: movl $1036, %edi # imm = 0x40C +; AVX512F-32-NEXT: bextrl %edi, %ecx, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 @@ -1874,6 +1883,7 @@ ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ecx, %ebp ; AVX512F-32-NEXT: shrl $13, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -1884,9 +1894,8 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, 
%ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000 -; AVX512F-32-NEXT: shrl $14, %eax +; AVX512F-32-NEXT: movl $526, %edx # imm = 0x20E +; AVX512F-32-NEXT: bextrl %edx, %ebp, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 @@ -1894,10 +1903,9 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-32-NEXT: movl $271, %esi # imm = 0x10F +; AVX512F-32-NEXT: bextrl %esi, %ebp, %eax ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000 -; AVX512F-32-NEXT: shrl $15, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] @@ -1906,9 +1914,8 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrl $16, %eax -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: shrl $16, %ecx +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 @@ -1916,10 +1923,10 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: andb $2, %al +; AVX512F-32-NEXT: shrb %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -1928,9 +1935,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %ebx +; AVX512F-32-NEXT: movl %ecx, %ebx ; AVX512F-32-NEXT: andb $15, %bl -; AVX512F-32-NEXT: movl %ebx, %edx +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $2, %bl ; AVX512F-32-NEXT: kmovd %ebx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -1941,8 +1948,8 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: shrb $3, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: shrb $3, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -1951,9 +1958,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: shrb $4, %dl -; 
AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $4, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -1962,10 +1969,10 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: shrb $5, %dl -; AVX512F-32-NEXT: andb $1, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $5, %al +; AVX512F-32-NEXT: andb $1, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -1974,9 +1981,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: shrb $6, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $6, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -1985,9 +1992,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: # kill: %AL %AL %EAX %EAX -; AVX512F-32-NEXT: shrb $7, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: # kill: %CL %CL %ECX %ECX +; AVX512F-32-NEXT: shrb $7, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -1996,9 +2003,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrl $24, %eax -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %ebp, %ebx +; AVX512F-32-NEXT: shrl $24, %ebx +; AVX512F-32-NEXT: kmovd %ebx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm3 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 @@ -2006,10 +2013,10 @@ ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: andb $2, %al +; AVX512F-32-NEXT: shrb %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6] ; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 @@ -2018,11 +2025,11 @@ ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movl %edx, %eax -; AVX512F-32-NEXT: shrb $2, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; 
AVX512F-32-NEXT: # kill: %BL %BL %EBX %EBX +; AVX512F-32-NEXT: andb $15, %bl +; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: shrb $2, %bl +; AVX512F-32-NEXT: kmovd %ebx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 ; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 ; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 @@ -2041,6 +2048,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 +; AVX512F-32-NEXT: movl %ebp, %ecx ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2053,7 +2061,6 @@ ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: movl %ecx, %esi ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2065,7 +2072,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %esi, %eax +; AVX512F-32-NEXT: movl %ebp, %eax ; AVX512F-32-NEXT: shrl $30, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2076,7 +2083,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %esi, %eax +; AVX512F-32-NEXT: movl %ebp, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2087,8 +2094,8 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; AVX512F-32-NEXT: kmovd %ebx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 @@ -2096,7 +2103,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2108,11 +2115,11 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %edx -; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movl %edx, %eax -; AVX512F-32-NEXT: shrb $2, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: andb $15, %cl +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $2, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 @@ -2131,7 +2138,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2142,7 +2149,7 @@ ; 
AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2154,7 +2161,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2165,7 +2172,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2176,7 +2183,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %ch, %al +; AVX512F-32-NEXT: movb %bh, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 @@ -2197,11 +2204,11 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %ch, %dl -; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movl %edx, %eax -; AVX512F-32-NEXT: shrb $2, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movb %bh, %cl +; AVX512F-32-NEXT: andb $15, %cl +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $2, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 @@ -2220,9 +2227,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000 -; AVX512F-32-NEXT: shrl $12, %eax +; AVX512F-32-NEXT: bextrl %edi, %ebx, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 @@ -2232,7 +2237,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrl $13, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2244,9 +2249,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000 -; AVX512F-32-NEXT: shrl $14, %eax +; AVX512F-32-NEXT: bextrl %edx, %ebx, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 @@ -2255,10 +2258,8 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; 
AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: bextrl %esi, %ebx, %eax ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000 -; AVX512F-32-NEXT: shrl $15, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] @@ -2268,9 +2269,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %ebx -; AVX512F-32-NEXT: shrl $16, %ebx -; AVX512F-32-NEXT: kmovd %ebx, %k1 +; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: shrl $16, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 @@ -2279,10 +2280,10 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %edx -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: andb $2, %cl +; AVX512F-32-NEXT: shrb %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -2291,11 +2292,11 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: andb $15, %al -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: shrb $2, %al -; AVX512F-32-NEXT: kmovd %eax, %k0 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: andb $15, %cl +; AVX512F-32-NEXT: movl %ecx, %edx +; AVX512F-32-NEXT: shrb $2, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k0 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 @@ -2317,9 +2318,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $4, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: shrb $4, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -2329,10 +2330,10 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $5, %al -; AVX512F-32-NEXT: andb $1, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: shrb $5, %cl +; AVX512F-32-NEXT: andb $1, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -2342,9 +2343,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: 
vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $6, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: shrb $6, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -2354,9 +2355,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: # kill: %BL %BL %EBX %EBX -; AVX512F-32-NEXT: shrb $7, %bl -; AVX512F-32-NEXT: kmovd %ebx, %k1 +; AVX512F-32-NEXT: # kill: %AL %AL %EAX %EAX +; AVX512F-32-NEXT: shrb $7, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -2366,7 +2367,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrl $24, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2377,10 +2378,10 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: andb $2, %cl +; AVX512F-32-NEXT: shrb %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -2389,11 +2390,11 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movl %edx, %eax -; AVX512F-32-NEXT: shrb $2, %dl -; AVX512F-32-NEXT: kmovd %edx, %k0 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: andb $15, %cl +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $2, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k0 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 @@ -2412,11 +2413,11 @@ ; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4 ; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 @@ -2437,7 +2438,7 @@ ; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] ; 
AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrl $30, %eax ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 @@ -2450,7 +2451,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0 ; AVX512F-32-NEXT: kshiftrq $1, %k0, %k0 @@ -2485,11 +2486,13 @@ ; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl %esi, %eax -; AVX512F-32-NEXT: adcxl %ecx, %edx +; AVX512F-32-NEXT: addl %ebp, %eax +; AVX512F-32-NEXT: adcxl %ebx, %edx ; AVX512F-32-NEXT: addl $60, %esp ; AVX512F-32-NEXT: popl %esi +; AVX512F-32-NEXT: popl %edi ; AVX512F-32-NEXT: popl %ebx +; AVX512F-32-NEXT: popl %ebp ; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask) @@ -2539,7 +2542,7 @@ ; AVX512F-32-LABEL: test_ucmp_b_512: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: subl $60, %esp -; AVX512F-32-NEXT: .Lcfi10: +; AVX512F-32-NEXT: .Lcfi14: ; AVX512F-32-NEXT: .cfi_def_cfa_offset 64 ; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) @@ -2618,19 +2621,29 @@ ; ; AVX512F-32-LABEL: test_mask_x86_avx512_ucmp_b_512: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: pushl %ebx -; AVX512F-32-NEXT: .Lcfi11: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .Lcfi15: ; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 -; AVX512F-32-NEXT: pushl %esi -; AVX512F-32-NEXT: .Lcfi12: +; AVX512F-32-NEXT: pushl %ebx +; AVX512F-32-NEXT: .Lcfi16: ; AVX512F-32-NEXT: .cfi_def_cfa_offset 12 +; AVX512F-32-NEXT: pushl %edi +; AVX512F-32-NEXT: .Lcfi17: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: pushl %esi +; AVX512F-32-NEXT: .Lcfi18: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 20 ; AVX512F-32-NEXT: subl $60, %esp -; AVX512F-32-NEXT: .Lcfi13: -; AVX512F-32-NEXT: .cfi_def_cfa_offset 72 -; AVX512F-32-NEXT: .Lcfi14: -; AVX512F-32-NEXT: .cfi_offset %esi, -12 -; AVX512F-32-NEXT: .Lcfi15: -; AVX512F-32-NEXT: .cfi_offset %ebx, -8 +; AVX512F-32-NEXT: .Lcfi19: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 80 +; AVX512F-32-NEXT: .Lcfi20: +; AVX512F-32-NEXT: .cfi_offset %esi, -20 +; AVX512F-32-NEXT: .Lcfi21: +; AVX512F-32-NEXT: .cfi_offset %edi, -16 +; AVX512F-32-NEXT: .Lcfi22: +; AVX512F-32-NEXT: .cfi_offset %ebx, -12 +; AVX512F-32-NEXT: .Lcfi23: +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $5, %al @@ -2746,9 +2759,8 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000 -; AVX512F-32-NEXT: shrl $12, %eax +; AVX512F-32-NEXT: movl $1036, %edi # imm = 0x40C +; AVX512F-32-NEXT: bextrl %edi, %ecx, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 @@ -2758,6 +2770,7 @@ ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = 
zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ecx, %ebp ; AVX512F-32-NEXT: shrl $13, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2768,9 +2781,8 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000 -; AVX512F-32-NEXT: shrl $14, %eax +; AVX512F-32-NEXT: movl $526, %edx # imm = 0x20E +; AVX512F-32-NEXT: bextrl %edx, %ebp, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 @@ -2778,10 +2790,9 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-32-NEXT: movl $271, %esi # imm = 0x10F +; AVX512F-32-NEXT: bextrl %esi, %ebp, %eax ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000 -; AVX512F-32-NEXT: shrl $15, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] @@ -2790,9 +2801,8 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrl $16, %eax -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: shrl $16, %ecx +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 @@ -2800,10 +2810,10 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: andb $2, %al +; AVX512F-32-NEXT: shrb %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -2812,9 +2822,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %ebx +; AVX512F-32-NEXT: movl %ecx, %ebx ; AVX512F-32-NEXT: andb $15, %bl -; AVX512F-32-NEXT: movl %ebx, %edx +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $2, %bl ; AVX512F-32-NEXT: kmovd %ebx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2825,8 +2835,8 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: shrb $3, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: shrb $3, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 ; AVX512F-32-NEXT: 
vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -2835,9 +2845,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: shrb $4, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $4, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -2846,10 +2856,10 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: shrb $5, %dl -; AVX512F-32-NEXT: andb $1, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $5, %al +; AVX512F-32-NEXT: andb $1, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -2858,9 +2868,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: shrb $6, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $6, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -2869,9 +2879,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: # kill: %AL %AL %EAX %EAX -; AVX512F-32-NEXT: shrb $7, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: # kill: %CL %CL %ECX %ECX +; AVX512F-32-NEXT: shrb $7, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -2880,9 +2890,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrl $24, %eax -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %ebp, %ebx +; AVX512F-32-NEXT: shrl $24, %ebx +; AVX512F-32-NEXT: kmovd %ebx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm3 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 @@ -2890,10 +2900,10 @@ ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: andb $2, %al +; AVX512F-32-NEXT: shrb %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6] ; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 @@ -2902,11 +2912,11 @@ ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 ; 
AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movl %edx, %eax -; AVX512F-32-NEXT: shrb $2, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: # kill: %BL %BL %EBX %EBX +; AVX512F-32-NEXT: andb $15, %bl +; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: shrb $2, %bl +; AVX512F-32-NEXT: kmovd %ebx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 ; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 ; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 @@ -2925,6 +2935,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 +; AVX512F-32-NEXT: movl %ebp, %ecx ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2937,7 +2948,6 @@ ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: movl %ecx, %esi ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2949,7 +2959,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %esi, %eax +; AVX512F-32-NEXT: movl %ebp, %eax ; AVX512F-32-NEXT: shrl $30, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2960,7 +2970,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %esi, %eax +; AVX512F-32-NEXT: movl %ebp, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2971,8 +2981,8 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: kmovd %ecx, %k1 +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; AVX512F-32-NEXT: kmovd %ebx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 @@ -2980,7 +2990,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2992,11 +3002,11 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %edx -; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movl %edx, %eax -; AVX512F-32-NEXT: shrb $2, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %ebx, %ecx +; AVX512F-32-NEXT: andb $15, %cl +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $2, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 @@ -3015,7 +3025,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; 
AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -3026,7 +3036,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -3038,7 +3048,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -3049,7 +3059,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -3060,7 +3070,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %ch, %al +; AVX512F-32-NEXT: movb %bh, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 @@ -3081,11 +3091,11 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %ch, %dl -; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movl %edx, %eax -; AVX512F-32-NEXT: shrb $2, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movb %bh, %cl +; AVX512F-32-NEXT: andb $15, %cl +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $2, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 @@ -3104,9 +3114,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000 -; AVX512F-32-NEXT: shrl $12, %eax +; AVX512F-32-NEXT: bextrl %edi, %ebx, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 @@ -3116,7 +3124,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrl $13, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -3128,9 +3136,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000 -; AVX512F-32-NEXT: shrl $14, %eax +; AVX512F-32-NEXT: bextrl %edx, %ebx, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; 
AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 @@ -3139,10 +3145,8 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: bextrl %esi, %ebx, %eax ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000 -; AVX512F-32-NEXT: shrl $15, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] @@ -3152,9 +3156,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %ebx -; AVX512F-32-NEXT: shrl $16, %ebx -; AVX512F-32-NEXT: kmovd %ebx, %k1 +; AVX512F-32-NEXT: movl %ebx, %eax +; AVX512F-32-NEXT: shrl $16, %eax +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 @@ -3163,10 +3167,10 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %edx -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: andb $2, %cl +; AVX512F-32-NEXT: shrb %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -3175,11 +3179,11 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: andb $15, %al -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: shrb $2, %al -; AVX512F-32-NEXT: kmovd %eax, %k0 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: andb $15, %cl +; AVX512F-32-NEXT: movl %ecx, %edx +; AVX512F-32-NEXT: shrb $2, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k0 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 @@ -3201,9 +3205,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $4, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: shrb $4, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -3213,10 +3217,10 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $5, %al -; AVX512F-32-NEXT: andb $1, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: shrb $5, %cl +; 
AVX512F-32-NEXT: andb $1, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -3226,9 +3230,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ebx, %eax -; AVX512F-32-NEXT: shrb $6, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: shrb $6, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -3238,9 +3242,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: # kill: %BL %BL %EBX %EBX -; AVX512F-32-NEXT: shrb $7, %bl -; AVX512F-32-NEXT: kmovd %ebx, %k1 +; AVX512F-32-NEXT: # kill: %AL %AL %EAX %EAX +; AVX512F-32-NEXT: shrb $7, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -3250,7 +3254,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrl $24, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -3261,10 +3265,10 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: andb $2, %cl +; AVX512F-32-NEXT: shrb %cl +; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -3273,11 +3277,11 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: movl %eax, %edx -; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movl %edx, %eax -; AVX512F-32-NEXT: shrb $2, %dl -; AVX512F-32-NEXT: kmovd %edx, %k0 +; AVX512F-32-NEXT: movl %eax, %ecx +; AVX512F-32-NEXT: andb $15, %cl +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrb $2, %cl +; AVX512F-32-NEXT: kmovd %ecx, %k0 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 @@ -3296,11 +3300,11 @@ ; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4 ; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, 
%zmm2, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 @@ -3321,7 +3325,7 @@ ; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrl $30, %eax ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 @@ -3334,7 +3338,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0 ; AVX512F-32-NEXT: kshiftrq $1, %k0, %k0 @@ -3369,11 +3373,13 @@ ; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: addl %esi, %eax -; AVX512F-32-NEXT: adcxl %ecx, %edx +; AVX512F-32-NEXT: addl %ebp, %eax +; AVX512F-32-NEXT: adcxl %ebx, %edx ; AVX512F-32-NEXT: addl $60, %esp ; AVX512F-32-NEXT: popl %esi +; AVX512F-32-NEXT: popl %edi ; AVX512F-32-NEXT: popl %ebx +; AVX512F-32-NEXT: popl %ebp ; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask) Index: llvm/trunk/test/CodeGen/X86/divide-by-constant.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/divide-by-constant.ll +++ llvm/trunk/test/CodeGen/X86/divide-by-constant.ll @@ -48,16 +48,16 @@ ; X32: # BB#0: # %entry ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: imull $171, %eax, %eax -; X32-NEXT: andl $65024, %eax # imm = 0xFE00 ; X32-NEXT: shrl $9, %eax +; X32-NEXT: movzwl %ax, %eax ; X32-NEXT: # kill: %AL %AL %EAX ; X32-NEXT: retl ; ; X64-LABEL: test3: ; X64: # BB#0: # %entry ; X64-NEXT: imull $171, %esi, %eax -; X64-NEXT: andl $65024, %eax # imm = 0xFE00 ; X64-NEXT: shrl $9, %eax +; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: # kill: %AL %AL %EAX ; X64-NEXT: retq entry: @@ -167,8 +167,8 @@ ; X32-NEXT: shrb %al ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: imull $211, %eax, %eax -; X32-NEXT: andl $24576, %eax # imm = 0x6000 ; X32-NEXT: shrl $13, %eax +; X32-NEXT: movzwl %ax, %eax ; X32-NEXT: # kill: %AL %AL %EAX ; X32-NEXT: retl ; @@ -177,8 +177,8 @@ ; X64-NEXT: shrb %dil ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: imull $211, %eax, %eax -; X64-NEXT: andl $24576, %eax # imm = 0x6000 ; X64-NEXT: shrl $13, %eax +; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: # kill: %AL %AL %EAX ; X64-NEXT: retq %div = udiv i8 %x, 78 @@ -192,8 +192,8 @@ ; X32-NEXT: shrb $2, %al ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: imull $71, %eax, %eax -; X32-NEXT: andl $6144, %eax # imm = 0x1800 ; X32-NEXT: shrl $11, %eax +; X32-NEXT: movzwl %ax, %eax ; X32-NEXT: # kill: %AL %AL %EAX ; X32-NEXT: retl ; @@ -202,8 +202,8 @@ ; X64-NEXT: shrb $2, %dil ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: imull $71, %eax, %eax -; X64-NEXT: andl $6144, %eax # imm = 0x1800 ; X64-NEXT: shrl $11, %eax +; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: # kill: %AL %AL %EAX ; X64-NEXT: retq %div = udiv i8 %x, 116 Index: llvm/trunk/test/CodeGen/X86/known-bits.ll =================================================================== 
--- llvm/trunk/test/CodeGen/X86/known-bits.ll +++ llvm/trunk/test/CodeGen/X86/known-bits.ll @@ -12,8 +12,8 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movzbl (%eax), %eax ; X32-NEXT: imull $101, %eax, %eax -; X32-NEXT: andl $16384, %eax # imm = 0x4000 ; X32-NEXT: shrl $14, %eax +; X32-NEXT: movzwl %ax, %eax ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: vmovd %eax, %xmm0 ; X32-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -50,8 +50,8 @@ ; X64: # BB#0: # %BB ; X64-NEXT: movzbl (%rdi), %eax ; X64-NEXT: imull $101, %eax, %eax -; X64-NEXT: andl $16384, %eax # imm = 0x4000 ; X64-NEXT: shrl $14, %eax +; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: vmovd %eax, %xmm0 ; X64-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero Index: llvm/trunk/test/CodeGen/X86/live-out-reg-info.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/live-out-reg-info.ll +++ llvm/trunk/test/CodeGen/X86/live-out-reg-info.ll @@ -12,10 +12,9 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .Lcfi0: ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shrl $23, %eax -; CHECK-NEXT: testb $1, %ah -; CHECK-NEXT: jne .LBB0_2 +; CHECK-NEXT: shrl $23, %edi +; CHECK-NEXT: btl $8, %edi +; CHECK-NEXT: jb .LBB0_2 ; CHECK-NEXT: # BB#1: # %true ; CHECK-NEXT: callq qux ; CHECK-NEXT: .LBB0_2: # %false Index: llvm/trunk/test/CodeGen/X86/test-shrink.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/test-shrink.ll +++ llvm/trunk/test/CodeGen/X86/test-shrink.ll @@ -3,10 +3,10 @@ ; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=CHECK-32 ; CHECK-64-LABEL: g64xh: -; CHECK-64: testb $8, {{%ah|%ch}} +; CHECK-64: btl $11 ; CHECK-64: ret ; CHECK-32-LABEL: g64xh: -; CHECK-32: testb $8, %ah +; CHECK-32: btl $11 ; CHECK-32: ret define void @g64xh(i64 inreg %x) nounwind { %t = and i64 %x, 2048 @@ -37,10 +37,10 @@ ret void } ; CHECK-64-LABEL: g32xh: -; CHECK-64: testb $8, {{%ah|%ch}} +; CHECK-64: btl $11 ; CHECK-64: ret ; CHECK-32-LABEL: g32xh: -; CHECK-32: testb $8, %ah +; CHECK-32: btl $11 ; CHECK-32: ret define void @g32xh(i32 inreg %x) nounwind { %t = and i32 %x, 2048 @@ -71,10 +71,10 @@ ret void } ; CHECK-64-LABEL: g16xh: -; CHECK-64: testb $8, {{%ah|%ch}} +; CHECK-64: btl $11 ; CHECK-64: ret ; CHECK-32-LABEL: g16xh: -; CHECK-32: testb $8, %ah +; CHECK-32: btl $11 ; CHECK-32: ret define void @g16xh(i16 inreg %x) nounwind { %t = and i16 %x, 2048 Index: llvm/trunk/test/CodeGen/X86/urem-i8-constant.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/urem-i8-constant.ll +++ llvm/trunk/test/CodeGen/X86/urem-i8-constant.ll @@ -1,13 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s +; computeKnownBits determines that we don't need a mask op that is required in the general case. 
+ define i8 @foo(i8 %tmp325) { ; CHECK-LABEL: foo: ; CHECK: # BB#0: ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: imull $111, %ecx, %eax -; CHECK-NEXT: andl $28672, %eax # imm = 0x7000 ; CHECK-NEXT: shrl $12, %eax +; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: movb $37, %dl ; CHECK-NEXT: # kill: %AL %AL %EAX ; CHECK-NEXT: mulb %dl Index: llvm/trunk/test/CodeGen/X86/xor-icmp.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/xor-icmp.ll +++ llvm/trunk/test/CodeGen/X86/xor-icmp.ll @@ -17,15 +17,13 @@ ; ; X64-LABEL: t: ; X64: # BB#0: # %entry -; X64-NEXT: movl %edi, %eax -; X64-NEXT: xorl %esi, %eax -; X64-NEXT: testb $64, %ah -; X64-NEXT: je .LBB0_1 -; X64-NEXT: # BB#2: # %bb1 +; X64-NEXT: xorl %esi, %edi ; X64-NEXT: xorl %eax, %eax +; X64-NEXT: btl $14, %edi +; X64-NEXT: jae .LBB0_1 +; X64-NEXT: # BB#2: # %bb1 ; X64-NEXT: jmp bar # TAILCALL ; X64-NEXT: .LBB0_1: # %bb -; X64-NEXT: xorl %eax, %eax ; X64-NEXT: jmp foo # TAILCALL entry: %0 = and i32 %a, 16384