Index: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1220,6 +1220,12 @@ Sign, ShAmt)); } } + // If this is a bitcast, let computeKnownBits handle it. Only do this on a + // recursive call where Known may be useful to the caller. + if (Depth > 0) { + TLO.DAG.computeKnownBits(Op, Known, Depth); + return false; + } break; case ISD::ADD: case ISD::MUL: Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1773,18 +1773,15 @@ ; AVX512BW-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: addq %rcx, %rax -; AVX512BW-NEXT: kxorq %k0, %k0, %k0 -; AVX512BW-NEXT: kmovq %k0, %rcx -; AVX512BW-NEXT: orq %rax, %rcx ; AVX512BW-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} -; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: addq %rcx, %rax -; AVX512BW-NEXT: vpcmpleb %zmm0, %zmm1, %k0 {%k1} ; AVX512BW-NEXT: kmovq %k0, %rcx ; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpleb %zmm0, %zmm1, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rdx +; AVX512BW-NEXT: addq %rcx, %rdx ; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: addq %rdx, %rax ; AVX512BW-NEXT: addq %rdi, %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1795,8 +1792,8 @@ ; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 ; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: .cfi_def_cfa_offset 12 -; AVX512F-32-NEXT: subl $68, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 80 +; AVX512F-32-NEXT: subl $60, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 72 ; AVX512F-32-NEXT: .cfi_offset %esi, -12 ; AVX512F-32-NEXT: .cfi_offset %ebx, -8 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx @@ -2344,10 +2341,6 @@ ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kxorq %k0, %k0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax @@ -2362,7 +2355,7 @@ ; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl %esi, %eax ; AVX512F-32-NEXT: adcl %ebx, %edx -; AVX512F-32-NEXT: addl $68, %esp +; AVX512F-32-NEXT: addl $60, %esp ; AVX512F-32-NEXT: popl %esi ; AVX512F-32-NEXT: popl %ebx ; AVX512F-32-NEXT: vzeroupper @@ -2478,18 +2471,15 @@ ; AVX512BW-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: addq %rcx, %rax -; AVX512BW-NEXT: kxorq %k0, %k0, %k0 -; AVX512BW-NEXT: kmovq %k0, %rcx -; AVX512BW-NEXT: orq %rax, %rcx ; AVX512BW-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} -; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: addq %rcx, %rax -; AVX512BW-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovq %k0, %rcx ; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rdx +; AVX512BW-NEXT: addq %rcx, %rdx ; AVX512BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: addq %rdx, %rax ; AVX512BW-NEXT: addq %rdi, %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2500,8 +2490,8 @@ ; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 ; AVX512F-32-NEXT: pushl %esi ; AVX512F-32-NEXT: .cfi_def_cfa_offset 12 -; AVX512F-32-NEXT: subl $68, %esp -; AVX512F-32-NEXT: .cfi_def_cfa_offset 80 +; AVX512F-32-NEXT: subl $60, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 72 ; AVX512F-32-NEXT: .cfi_offset %esi, -12 ; AVX512F-32-NEXT: .cfi_offset %ebx, -8 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx @@ -3049,10 +3039,6 @@ ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kxorq %k0, %k0, %k0 -; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax @@ -3067,7 +3053,7 @@ ; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl %esi, %eax ; AVX512F-32-NEXT: adcl %ebx, %edx -; AVX512F-32-NEXT: addl $68, %esp +; AVX512F-32-NEXT: addl $60, %esp ; AVX512F-32-NEXT: popl %esi ; AVX512F-32-NEXT: popl %ebx ; AVX512F-32-NEXT: vzeroupper @@ -3172,24 +3158,24 @@ ; AVX512BW-NEXT: vpcmplew %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: addl %ecx, %eax -; AVX512BW-NEXT: kxord %k0, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %ecx -; AVX512BW-NEXT: orl %eax, %ecx ; AVX512BW-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: addl %ecx, %eax -; AVX512BW-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %ecx ; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %edx +; AVX512BW-NEXT: addl %ecx, %edx ; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: addl %edx, %eax ; AVX512BW-NEXT: addl %edi, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_cmp_w_512: ; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %esi +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %esi, -8 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} @@ -3200,19 +3186,17 @@ ; AVX512F-32-NEXT: vpcmplew %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: kmovd %k0, %eax ; AVX512F-32-NEXT: addl %edx, %eax -; AVX512F-32-NEXT: kxord %k0, %k0, %k0 -; AVX512F-32-NEXT: kmovd %k0, %edx -; AVX512F-32-NEXT: orl %eax, %edx ; AVX512F-32-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovd %k0, %eax -; AVX512F-32-NEXT: addl %edx, %eax -; AVX512F-32-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1} ; AVX512F-32-NEXT: kmovd %k0, %edx ; AVX512F-32-NEXT: addl %eax, %edx +; AVX512F-32-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %esi +; AVX512F-32-NEXT: addl %edx, %esi ; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: kmovd %k0, %eax -; AVX512F-32-NEXT: addl %edx, %eax +; AVX512F-32-NEXT: addl %esi, %eax ; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: popl %esi ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) @@ -3315,24 +3299,24 @@ ; AVX512BW-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: addl %ecx, %eax -; AVX512BW-NEXT: kxord %k0, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %ecx -; AVX512BW-NEXT: orl %eax, %ecx ; AVX512BW-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: addl %ecx, %eax -; AVX512BW-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %ecx ; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %edx +; AVX512BW-NEXT: addl %ecx, %edx ; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: addl %edx, %eax ; AVX512BW-NEXT: addl %edi, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_ucmp_w_512: ; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %esi +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %esi, -8 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: kmovd %ecx, %k1 ; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} @@ -3343,19 +3327,17 @@ ; AVX512F-32-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: kmovd %k0, %eax ; AVX512F-32-NEXT: addl %edx, %eax -; AVX512F-32-NEXT: kxord %k0, %k0, %k0 -; AVX512F-32-NEXT: kmovd %k0, %edx -; AVX512F-32-NEXT: orl %eax, %edx ; AVX512F-32-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: kmovd %k0, %eax -; AVX512F-32-NEXT: addl %edx, %eax -; AVX512F-32-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: kmovd %k0, %edx ; AVX512F-32-NEXT: addl %eax, %edx +; AVX512F-32-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %esi +; AVX512F-32-NEXT: addl %edx, %esi ; AVX512F-32-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: kmovd %k0, %eax -; AVX512F-32-NEXT: addl %edx, %eax +; AVX512F-32-NEXT: addl %esi, %eax ; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: popl %esi ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) Index: llvm/trunk/test/CodeGen/X86/combine-and.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/combine-and.ll +++ llvm/trunk/test/CodeGen/X86/combine-and.ll @@ -291,7 +291,6 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: psrlw $1, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: paddb %xmm1, %xmm0 ; CHECK-NEXT: retq %1 = lshr <16 x i8> %a0, Index: llvm/trunk/test/CodeGen/X86/combine-fcopysign.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/combine-fcopysign.ll +++ llvm/trunk/test/CodeGen/X86/combine-fcopysign.ll @@ -10,20 +10,13 @@ define <4 x float> @combine_vec_fcopysign_pos_constant0(<4 x float> %x) { ; SSE-LABEL: combine_vec_fcopysign_pos_constant0: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm1 = [2.000000e+00,2.000000e+00,2.000000e+00,2.000000e+00] -; SSE-NEXT: andps {{.*}}(%rip), %xmm1 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_fcopysign_pos_constant0: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; AVX-NEXT: vandps %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> ) ret <4 x float> %1 @@ -32,19 +25,13 @@ define <4 x float> @combine_vec_fcopysign_pos_constant1(<4 x float> %x) { ; SSE-LABEL: combine_vec_fcopysign_pos_constant1: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm1 = [0.000000e+00,2.000000e+00,4.000000e+00,8.000000e+00] -; SSE-NEXT: andps {{.*}}(%rip), %xmm1 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_fcopysign_pos_constant1: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 -; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> ) ret <4 x float> %1 @@ -70,19 +57,12 @@ define <4 x float> @combine_vec_fcopysign_neg_constant0(<4 x float> %x) { ; SSE-LABEL: combine_vec_fcopysign_neg_constant0: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm1 = [-2.000000e+00,-2.000000e+00,-2.000000e+00,-2.000000e+00] -; SSE-NEXT: andps {{.*}}(%rip), %xmm1 -; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: orps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_fcopysign_neg_constant0: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; AVX-NEXT: vandps %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> ) @@ -92,18 +72,12 @@ define <4 x float> @combine_vec_fcopysign_neg_constant1(<4 x float> %x) { ; SSE-LABEL: combine_vec_fcopysign_neg_constant1: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm1 = [-0.000000e+00,-2.000000e+00,-4.000000e+00,-8.000000e+00] -; SSE-NEXT: andps {{.*}}(%rip), %xmm1 -; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: orps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_fcopysign_neg_constant1: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 -; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> ) @@ -113,15 +87,12 @@ define <4 x float> @combine_vec_fcopysign_fneg_fabs_sgn(<4 x float> %x, <4 x float> %y) { ; SSE-LABEL: combine_vec_fcopysign_fneg_fabs_sgn: ; SSE: # %bb.0: -; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: orps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_fcopysign_fneg_fabs_sgn: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y) Index: llvm/trunk/test/CodeGen/X86/psubus.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/psubus.ll +++ llvm/trunk/test/CodeGen/X86/psubus.ll @@ -466,11 +466,11 @@ ; SSE2-NEXT: psubd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm6 ; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 ; SSE2-NEXT: packssdw %xmm6, %xmm2 ; SSE2-NEXT: psubd %xmm1, %xmm3 @@ -494,11 +494,11 @@ ; SSSE3-NEXT: psubd %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm2, %xmm6 ; SSSE3-NEXT: pxor %xmm3, %xmm6 -; SSSE3-NEXT: pxor %xmm3, %xmm5 +; SSSE3-NEXT: por %xmm3, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: por %xmm4, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 ; SSSE3-NEXT: packssdw %xmm6, %xmm2 ; SSSE3-NEXT: psubd %xmm1, %xmm4 @@ -520,11 +520,11 @@ ; SSE41-NEXT: psubd %xmm1, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: por %xmm5, %xmm6 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 ; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: por %xmm3, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm1 ; SSE41-NEXT: packssdw %xmm1, %xmm0 ; SSE41-NEXT: psubd %xmm2, %xmm3 @@ -541,12 +541,12 @@ ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3 ; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2 @@ -564,7 +564,7 @@ ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 @@ -610,26 +610,26 @@ ; SSE2-NEXT: movdqa %xmm4, %xmm9 ; SSE2-NEXT: pxor %xmm0, %xmm9 ; SSE2-NEXT: psubd %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255] ; SSE2-NEXT: pand %xmm9, %xmm5 ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: pxor %xmm0, %xmm7 ; SSE2-NEXT: psubd %xmm10, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm10 +; SSE2-NEXT: por %xmm0, %xmm10 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 ; SSE2-NEXT: pand %xmm9, %xmm10 ; SSE2-NEXT: packuswb %xmm5, %xmm10 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: pxor %xmm0, %xmm5 ; SSE2-NEXT: psubd %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 ; SSE2-NEXT: pand %xmm9, %xmm6 ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 ; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: packuswb %xmm6, %xmm0 @@ -662,27 +662,27 @@ ; SSSE3-NEXT: movdqa %xmm2, %xmm9 ; SSSE3-NEXT: pxor %xmm0, %xmm9 ; SSSE3-NEXT: psubd %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: por %xmm0, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; SSSE3-NEXT: pshufb %xmm9, %xmm5 ; SSSE3-NEXT: movdqa %xmm1, %xmm6 ; SSSE3-NEXT: pxor %xmm0, %xmm6 ; SSSE3-NEXT: psubd %xmm10, %xmm1 -; SSSE3-NEXT: pxor %xmm0, %xmm10 +; SSSE3-NEXT: por %xmm0, %xmm10 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm10 ; SSSE3-NEXT: pshufb %xmm9, %xmm10 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] ; SSSE3-NEXT: movdqa %xmm4, %xmm5 ; SSSE3-NEXT: pxor %xmm0, %xmm5 ; SSSE3-NEXT: psubd %xmm7, %xmm4 -; SSSE3-NEXT: pxor %xmm0, %xmm7 +; SSSE3-NEXT: por %xmm0, %xmm7 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = ; SSSE3-NEXT: pshufb %xmm5, %xmm7 ; SSSE3-NEXT: movdqa %xmm3, %xmm6 ; SSSE3-NEXT: pxor %xmm0, %xmm6 -; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm8, %xmm0 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 ; SSSE3-NEXT: pshufb %xmm5, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] @@ -713,27 +713,27 @@ ; SSE41-NEXT: movdqa %xmm4, %xmm7 ; SSE41-NEXT: pxor %xmm5, %xmm7 ; SSE41-NEXT: psubd %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: por %xmm5, %xmm6 ; SSE41-NEXT: pcmpgtd %xmm7, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm10 = ; SSE41-NEXT: pshufb %xmm10, %xmm6 ; SSE41-NEXT: movdqa %xmm3, %xmm7 ; SSE41-NEXT: pxor %xmm5, %xmm7 ; SSE41-NEXT: psubd %xmm9, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm9 +; SSE41-NEXT: por %xmm5, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm7, %xmm9 ; SSE41-NEXT: pshufb %xmm10, %xmm9 ; SSE41-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] ; SSE41-NEXT: movdqa %xmm1, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: psubd %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; SSE41-NEXT: pshufb %xmm6, %xmm0 ; SSE41-NEXT: movdqa %xmm2, %xmm7 ; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pxor %xmm8, %xmm5 +; SSE41-NEXT: por %xmm8, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm7, %xmm5 ; SSE41-NEXT: pshufb %xmm6, %xmm5 ; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] @@ -762,18 +762,18 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm3 -; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm4 +; AVX1-NEXT: vpor %xmm6, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm4 -; AVX1-NEXT: vpxor %xmm6, %xmm10, %xmm5 +; AVX1-NEXT: vpor %xmm6, %xmm10, %xmm5 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm11 ; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm4 -; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm5 +; AVX1-NEXT: vpor %xmm6, %xmm9, %xmm5 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm3 -; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3 ; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpacksswb %xmm11, %xmm3, %xmm3 @@ -800,12 +800,12 @@ ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm6 +; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm6 ; AVX2-NEXT: vpcmpgtd %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 ; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm6 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm4 +; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm4 ; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 @@ -853,11 +853,11 @@ ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: psubd %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: psubd %xmm1, %xmm0 @@ -879,11 +879,11 @@ ; SSSE3-NEXT: movdqa %xmm0, %xmm5 ; SSSE3-NEXT: psubd %xmm2, %xmm0 ; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: por %xmm3, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 ; SSSE3-NEXT: packssdw %xmm5, %xmm4 ; SSSE3-NEXT: psubd %xmm1, %xmm3 @@ -904,11 +904,11 @@ ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psubd %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: por %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: por %xmm3, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 ; SSE41-NEXT: packssdw %xmm4, %xmm5 ; SSE41-NEXT: psubd %xmm2, %xmm3 @@ -926,11 +926,11 @@ ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5 +; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2 @@ -948,7 +948,7 @@ ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 @@ -987,11 +987,11 @@ ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: psubd %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: psubd %xmm1, %xmm0 @@ -1013,11 +1013,11 @@ ; SSSE3-NEXT: movdqa %xmm0, %xmm5 ; SSSE3-NEXT: psubd %xmm2, %xmm0 ; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: por %xmm3, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 ; SSSE3-NEXT: packssdw %xmm5, %xmm4 ; SSSE3-NEXT: psubd %xmm1, %xmm3 @@ -1038,11 +1038,11 @@ ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psubd %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: por %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: por %xmm3, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 ; SSE41-NEXT: packssdw %xmm4, %xmm5 ; SSE41-NEXT: psubd %xmm2, %xmm3 @@ -1060,11 +1060,11 @@ ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5 +; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2 @@ -1082,7 +1082,7 @@ ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2