Index: lib/CodeGen/TwoAddressInstructionPass.cpp
===================================================================
--- lib/CodeGen/TwoAddressInstructionPass.cpp
+++ lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1591,6 +1591,10 @@
     if (MO.isKill()) {
       MO.setIsKill(false);
       RemovedKillFlag = true;
+      // Propagate SrcRegMap. We do this only when the operand is killed since
+      // otherwise this copy is likely unavoidable and we don't want to
+      // propagate a physical register through it.
+      SrcRegMap[RegA] = RegB;
     }
 
     // Make sure regA is a legal regclass for the SrcIdx operand.
@@ -1602,9 +1606,6 @@
     // by SubRegB is compatible with RegA with no subregister. So regardless of
     // whether the dest oper writes a subreg, the source oper should not.
     MO.setSubReg(0);
-
-    // Propagate SrcRegMap.
-    SrcRegMap[RegA] = RegB;
   }
 
   if (AllUsesCopied) {
Index: test/CodeGen/X86/avx512-mask-op.ll
===================================================================
--- test/CodeGen/X86/avx512-mask-op.ll
+++ test/CodeGen/X86/avx512-mask-op.ll
@@ -151,10 +151,9 @@
 ; CHECK-LABEL: mand16:
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: xorl %esi, %ecx
-; CHECK-NEXT: andl %esi, %eax
-; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: xorl %esi, %eax
+; CHECK-NEXT: andl %esi, %edi
+; CHECK-NEXT: orl %edi, %eax
 ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT: retq
 ;
Index: test/CodeGen/X86/avx512-regcall-NoMask.ll
===================================================================
--- test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -924,45 +924,46 @@
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: subl $20, %esp
 ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edi, %esi
 ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: subl %ecx, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, %ebp
-; X32-NEXT: subl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: imull %ebp, %ebx
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: subl %edi, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: imull %ebp, %ecx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X32-NEXT: movl %edi, %ebp
 ; X32-NEXT: subl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT: imull %ebx, %ebp
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: subl %esi, %ebx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: movl %ecx, %eax
 ; X32-NEXT: subl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: imull %ebp, %eax
-; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: imull %ebx, %eax
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %esi
+; X32-NEXT: subl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl %ebx, %edx
+; X32-NEXT: subl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: imull %esi, %edx
+; X32-NEXT: addl %eax, %edx
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl (%esp), %esi # 4-byte Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp
 ; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: imull %eax, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: imull %ebp, %edx
-; X32-NEXT: addl %esi, %edx
+; X32-NEXT: imull %eax, %edi
 ; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: imull %edi, %ecx
-; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: imull %esi, %ecx
+; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: imull %ebp, %ebx
 ; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: addl %edx, %ebx
 ; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: addl $20, %esp
 ; X32-NEXT: popl %ebx
@@ -974,35 +975,40 @@
 ; WIN64-NEXT: pushq %r13
 ; WIN64-NEXT: pushq %rbp
 ; WIN64-NEXT: pushq %rbx
-; WIN64-NEXT: movl %eax, %r13d
-; WIN64-NEXT: subl %ecx, %eax
-; WIN64-NEXT: movl %edx, %ebp
-; WIN64-NEXT: subl %edi, %ebp
-; WIN64-NEXT: movl %r9d, %ebx
-; WIN64-NEXT: subl %r10d, %ebx
-; WIN64-NEXT: imull %ebx, %eax
-; WIN64-NEXT: movl %r11d, %ebx
-; WIN64-NEXT: subl %r12d, %ebx
-; WIN64-NEXT: imull %ebp, %ebx
-; WIN64-NEXT: movl %esi, %ebp
-; WIN64-NEXT: subl %r8d, %ebp
-; WIN64-NEXT: addl %ebx, %eax
+; WIN64-NEXT: subq $4, %rsp
+; WIN64-NEXT: movl %ecx, %ebp
+; WIN64-NEXT: movl %ecx, (%rsp) # 4-byte Spill
+; WIN64-NEXT: movl %r11d, %ecx
+; WIN64-NEXT: movl %eax, %ebx
+; WIN64-NEXT: subl %ebp, %ebx
+; WIN64-NEXT: movl %edx, %r13d
+; WIN64-NEXT: subl %edi, %r13d
+; WIN64-NEXT: movl %r9d, %ebp
+; WIN64-NEXT: subl %r10d, %ebp
+; WIN64-NEXT: imull %ebx, %ebp
+; WIN64-NEXT: subl %r12d, %r11d
+; WIN64-NEXT: imull %r13d, %r11d
+; WIN64-NEXT: movl %esi, %r13d
+; WIN64-NEXT: subl %r8d, %r13d
+; WIN64-NEXT: addl %ebp, %r11d
 ; WIN64-NEXT: movl %r14d, %ebx
 ; WIN64-NEXT: subl %r15d, %ebx
-; WIN64-NEXT: imull %ebp, %ebx
-; WIN64-NEXT: addl %ebx, %eax
-; WIN64-NEXT: addl %ecx, %r13d
+; WIN64-NEXT: imull %r13d, %ebx
+; WIN64-NEXT: addl %r11d, %ebx
+; WIN64-NEXT: addl (%rsp), %eax # 4-byte Folded Reload
 ; WIN64-NEXT: addl %edi, %edx
 ; WIN64-NEXT: addl %r8d, %esi
 ; WIN64-NEXT: addl %r10d, %r9d
-; WIN64-NEXT: imull %r13d, %r9d
-; WIN64-NEXT: addl %r12d, %r11d
-; WIN64-NEXT: imull %edx, %r11d
-; WIN64-NEXT: addl %r9d, %r11d
+; WIN64-NEXT: imull %eax, %r9d
+; WIN64-NEXT: addl %r12d, %ecx
+; WIN64-NEXT: imull %edx, %ecx
+; WIN64-NEXT: addl %r9d, %ecx
 ; WIN64-NEXT: addl %r15d, %r14d
 ; WIN64-NEXT: imull %esi, %r14d
-; WIN64-NEXT: addl %r11d, %r14d
-; WIN64-NEXT: addl %r14d, %eax
+; WIN64-NEXT: addl %ecx, %r14d
+; WIN64-NEXT: addl %r14d, %ebx
+; WIN64-NEXT: movl %ebx, %eax
+; WIN64-NEXT: addq $4, %rsp
 ; WIN64-NEXT: popq %rbx
 ; WIN64-NEXT: popq %rbp
 ; WIN64-NEXT: popq %r13
@@ -1012,36 +1018,38 @@
 ; LINUXOSX64: # %bb.0:
 ; LINUXOSX64-NEXT: pushq %rbp
 ; LINUXOSX64-NEXT: pushq %rbx
+; LINUXOSX64-NEXT: movl %ecx, %r11d
+; LINUXOSX64-NEXT: movl %r13d, %ecx
 ; LINUXOSX64-NEXT: movl %eax, %r10d
-; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %r11d
-; LINUXOSX64-NEXT: subl %ecx, %eax
+; LINUXOSX64-NEXT: subl %r11d, %r10d
 ; LINUXOSX64-NEXT: movl %edx, %ebx
 ; LINUXOSX64-NEXT: subl %edi, %ebx
 ; LINUXOSX64-NEXT: movl %r9d, %ebp
 ; LINUXOSX64-NEXT: subl %r12d, %ebp
-; LINUXOSX64-NEXT: imull %ebp, %eax
-; LINUXOSX64-NEXT: movl %r13d, %ebp
-; LINUXOSX64-NEXT: subl %r14d, %ebp
-; LINUXOSX64-NEXT: imull %ebx, %ebp
+; LINUXOSX64-NEXT: imull %r10d, %ebp
+; LINUXOSX64-NEXT: subl %r14d, %r13d
+; LINUXOSX64-NEXT: imull %ebx, %r13d
 ; LINUXOSX64-NEXT: movl %esi, %ebx
 ; LINUXOSX64-NEXT: subl %r8d, %ebx
-; LINUXOSX64-NEXT: addl
%ebp, %eax -; LINUXOSX64-NEXT: movl %r15d, %ebp -; LINUXOSX64-NEXT: subl %r11d, %ebp -; LINUXOSX64-NEXT: imull %ebx, %ebp -; LINUXOSX64-NEXT: addl %ebp, %eax -; LINUXOSX64-NEXT: addl %ecx, %r10d +; LINUXOSX64-NEXT: addl %ebp, %r13d +; LINUXOSX64-NEXT: movl %r15d, %r10d +; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %ebp +; LINUXOSX64-NEXT: subl %ebp, %r10d +; LINUXOSX64-NEXT: imull %ebx, %r10d +; LINUXOSX64-NEXT: addl %r13d, %r10d +; LINUXOSX64-NEXT: addl %r11d, %eax ; LINUXOSX64-NEXT: addl %edi, %edx ; LINUXOSX64-NEXT: addl %r8d, %esi ; LINUXOSX64-NEXT: addl %r12d, %r9d -; LINUXOSX64-NEXT: imull %r10d, %r9d -; LINUXOSX64-NEXT: addl %r14d, %r13d -; LINUXOSX64-NEXT: imull %edx, %r13d -; LINUXOSX64-NEXT: addl %r9d, %r13d -; LINUXOSX64-NEXT: addl %r11d, %r15d +; LINUXOSX64-NEXT: imull %eax, %r9d +; LINUXOSX64-NEXT: addl %r14d, %ecx +; LINUXOSX64-NEXT: imull %edx, %ecx +; LINUXOSX64-NEXT: addl %r9d, %ecx +; LINUXOSX64-NEXT: addl %ebp, %r15d ; LINUXOSX64-NEXT: imull %esi, %r15d -; LINUXOSX64-NEXT: addl %r13d, %r15d -; LINUXOSX64-NEXT: addl %r15d, %eax +; LINUXOSX64-NEXT: addl %ecx, %r15d +; LINUXOSX64-NEXT: addl %r15d, %r10d +; LINUXOSX64-NEXT: movl %r10d, %eax ; LINUXOSX64-NEXT: popq %rbx ; LINUXOSX64-NEXT: popq %rbp ; LINUXOSX64-NEXT: retq Index: test/CodeGen/X86/avx512-schedule.ll =================================================================== --- test/CodeGen/X86/avx512-schedule.ll +++ test/CodeGen/X86/avx512-schedule.ll @@ -6792,20 +6792,18 @@ ; GENERIC-LABEL: mand16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33] -; GENERIC-NEXT: movl %edi, %ecx # sched: [1:0.33] -; GENERIC-NEXT: xorl %esi, %ecx # sched: [1:0.33] -; GENERIC-NEXT: andl %esi, %eax # sched: [1:0.33] -; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33] +; GENERIC-NEXT: xorl %esi, %eax # sched: [1:0.33] +; GENERIC-NEXT: andl %esi, %edi # sched: [1:0.33] +; GENERIC-NEXT: orl %edi, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def $ax killed $ax killed $eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mand16: ; SKX: # %bb.0: ; SKX-NEXT: movl %edi, %eax # sched: [1:0.25] -; SKX-NEXT: movl %edi, %ecx # sched: [1:0.25] -; SKX-NEXT: xorl %esi, %ecx # sched: [1:0.25] -; SKX-NEXT: andl %esi, %eax # sched: [1:0.25] -; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25] +; SKX-NEXT: xorl %esi, %eax # sched: [1:0.25] +; SKX-NEXT: andl %esi, %edi # sched: [1:0.25] +; SKX-NEXT: orl %edi, %eax # sched: [1:0.25] ; SKX-NEXT: # kill: def $ax killed $ax killed $eax ; SKX-NEXT: retq # sched: [7:1.00] %ma = bitcast i16 %x to <16 x i1> Index: test/CodeGen/X86/avx512bw-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512bw-mask-op.ll +++ test/CodeGen/X86/avx512bw-mask-op.ll @@ -79,10 +79,9 @@ ; CHECK-LABEL: mand32: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: andl %esi, %ecx -; CHECK-NEXT: xorl %esi, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: orl %edi, %eax ; CHECK-NEXT: retq %ma = bitcast i32 %x to <32 x i1> %mb = bitcast i32 %y to <32 x i1> @@ -116,10 +115,9 @@ ; CHECK-LABEL: mand64: ; CHECK: ## %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq %rdi, %rcx -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: xorq %rsi, %rax -; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: xorq %rsi, %rdi +; CHECK-NEXT: orq %rdi, %rax ; CHECK-NEXT: retq %ma = bitcast i64 %x to <64 x i1> %mb = bitcast i64 %y to <64 x i1> Index: 
test/CodeGen/X86/avx512dq-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512dq-mask-op.ll +++ test/CodeGen/X86/avx512dq-mask-op.ll @@ -33,10 +33,9 @@ ; CHECK-LABEL: mand8: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: xorl %esi, %ecx -; CHECK-NEXT: andl %esi, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: andl %esi, %edi +; CHECK-NEXT: orl %edi, %eax ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %ma = bitcast i8 %x to <8 x i1> Index: test/CodeGen/X86/bitreverse.ll =================================================================== --- test/CodeGen/X86/bitreverse.ll +++ test/CodeGen/X86/bitreverse.ll @@ -341,21 +341,19 @@ ; ; X64-LABEL: test_bitreverse_i8: ; X64: # %bb.0: +; X64-NEXT: rolb $4, %dil ; X64-NEXT: movl %edi, %eax -; X64-NEXT: rolb $4, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $51, %cl -; X64-NEXT: shlb $2, %cl -; X64-NEXT: andb $-52, %al -; X64-NEXT: shrb $2, %al -; X64-NEXT: orb %cl, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $85, %cl -; X64-NEXT: addb %cl, %cl -; X64-NEXT: andb $-86, %al -; X64-NEXT: shrb %al -; X64-NEXT: orb %cl, %al -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: andb $51, %al +; X64-NEXT: shlb $2, %al +; X64-NEXT: andb $-52, %dil +; X64-NEXT: shrb $2, %dil +; X64-NEXT: orb %al, %dil +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andb $85, %al +; X64-NEXT: addb %al, %al +; X64-NEXT: andb $-86, %dil +; X64-NEXT: shrb %dil +; X64-NEXT: orb %dil, %al ; X64-NEXT: retq %b = call i8 @llvm.bitreverse.i8(i8 %a) ret i8 %b @@ -385,22 +383,20 @@ ; ; X64-LABEL: test_bitreverse_i4: ; X64: # %bb.0: +; X64-NEXT: rolb $4, %dil ; X64-NEXT: movl %edi, %eax -; X64-NEXT: rolb $4, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $51, %cl -; X64-NEXT: shlb $2, %cl -; X64-NEXT: andb $-52, %al -; X64-NEXT: shrb $2, %al -; X64-NEXT: orb %cl, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $80, %cl -; X64-NEXT: addb %cl, %cl -; X64-NEXT: andb $-96, %al -; X64-NEXT: shrb %al -; X64-NEXT: orb %cl, %al +; X64-NEXT: andb $51, %al +; X64-NEXT: shlb $2, %al +; X64-NEXT: andb $-52, %dil +; X64-NEXT: shrb $2, %dil +; X64-NEXT: orb %al, %dil +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andb $80, %al +; X64-NEXT: addb %al, %al +; X64-NEXT: andb $-96, %dil +; X64-NEXT: shrb %dil +; X64-NEXT: orb %dil, %al ; X64-NEXT: shrb $4, %al -; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %b = call i4 @llvm.bitreverse.i4(i4 %a) ret i4 %b Index: test/CodeGen/X86/bswap_tree2.ll =================================================================== --- test/CodeGen/X86/bswap_tree2.ll +++ test/CodeGen/X86/bswap_tree2.ll @@ -24,17 +24,16 @@ ; ; CHECK64-LABEL: test1: ; CHECK64: # %bb.0: -; CHECK64-NEXT: movl %edi, %eax ; CHECK64-NEXT: movl %edi, %ecx ; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 -; CHECK64-NEXT: movl %edi, %edx -; CHECK64-NEXT: orl $-16777216, %edx # imm = 0xFF000000 +; CHECK64-NEXT: movl %edi, %eax +; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 ; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: shrl $8, %edx -; CHECK64-NEXT: orl %ecx, %edx -; CHECK64-NEXT: bswapl %eax -; CHECK64-NEXT: shrl $16, %eax -; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: bswapl %edi +; CHECK64-NEXT: shrl $16, %edi +; CHECK64-NEXT: orl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff %byte1 = and i32 %x, 65280 
; 0x0000ff00 @@ -81,7 +80,7 @@ ; CHECK64-NEXT: andl $-16777216, %edi # imm = 0xFF000000 ; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 ; CHECK64-NEXT: orl %edi, %eax -; CHECK64-NEXT: leal (%rax,%rcx), %eax +; CHECK64-NEXT: orl %ecx, %eax ; CHECK64-NEXT: retq %byte1 = lshr i32 %x, 8 %byte0 = shl i32 %x, 8 Index: test/CodeGen/X86/bypass-slow-division-32.ll =================================================================== --- test/CodeGen/X86/bypass-slow-division-32.ll +++ test/CodeGen/X86/bypass-slow-division-32.ll @@ -143,7 +143,7 @@ ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: shrl $31, %eax ; CHECK-NEXT: sarl $3, %edx -; CHECK-NEXT: leal (%edx,%eax), %eax +; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retl %resultdiv = sdiv i32 %a, 33 ret i32 %resultdiv Index: test/CodeGen/X86/combine-fcopysign.ll =================================================================== --- test/CodeGen/X86/combine-fcopysign.ll +++ test/CodeGen/X86/combine-fcopysign.ll @@ -193,38 +193,36 @@ define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float> %y) { ; SSE-LABEL: combine_vec_fcopysign_fpext_sgn: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: cvtss2sd %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtss2sd %xmm2, %xmm0 ; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm7 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: andps %xmm7, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1,2,3] +; SSE-NEXT: movaps {{.*#+}} xmm6 +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: andps %xmm6, %xmm7 ; SSE-NEXT: movaps {{.*#+}} xmm8 = [-0.000000e+00,-0.000000e+00] -; SSE-NEXT: andps %xmm8, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: andps %xmm7, %xmm0 -; SSE-NEXT: xorps %xmm4, %xmm4 -; SSE-NEXT: cvtss2sd %xmm5, %xmm4 -; SSE-NEXT: andps %xmm8, %xmm4 -; SSE-NEXT: orps %xmm0, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: andps %xmm7, %xmm0 -; SSE-NEXT: cvtss2sd %xmm3, %xmm3 -; SSE-NEXT: andps %xmm8, %xmm3 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: andps %xmm7, %xmm1 -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtss2sd %xmm6, %xmm0 ; SSE-NEXT: andps %xmm8, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: orps %xmm7, %xmm0 +; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] +; SSE-NEXT: andps %xmm6, %xmm3 +; SSE-NEXT: cvtss2sd %xmm5, %xmm5 +; SSE-NEXT: andps %xmm8, %xmm5 +; SSE-NEXT: orps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: andps %xmm6, %xmm3 +; SSE-NEXT: cvtss2sd %xmm4, %xmm4 +; SSE-NEXT: andps %xmm8, %xmm4 +; SSE-NEXT: orps %xmm3, %xmm4 +; SSE-NEXT: andps %xmm6, %xmm1 +; SSE-NEXT: cvtss2sd %xmm2, %xmm2 +; SSE-NEXT: andps %xmm8, %xmm2 +; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_fcopysign_fpext_sgn: @@ -245,36 +243,37 @@ define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x double> %y) { ; SSE-LABEL: 
combine_vec_fcopysign_fptrunc_sgn: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movaps {{.*#+}} xmm5 -; SSE-NEXT: andps %xmm5, %xmm0 -; SSE-NEXT: cvtsd2ss %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: andps %xmm5, %xmm6 +; SSE-NEXT: cvtsd2ss %xmm1, %xmm3 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00] -; SSE-NEXT: andps %xmm4, %xmm6 -; SSE-NEXT: orps %xmm6, %xmm0 -; SSE-NEXT: movshdup {{.*#+}} xmm6 = xmm3[1,1,3,3] +; SSE-NEXT: andps %xmm4, %xmm3 +; SSE-NEXT: orps %xmm6, %xmm3 +; SSE-NEXT: movshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] ; SSE-NEXT: andps %xmm5, %xmm6 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: cvtsd2ss %xmm1, %xmm1 ; SSE-NEXT: andps %xmm4, %xmm1 ; SSE-NEXT: orps %xmm6, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: andps %xmm5, %xmm1 ; SSE-NEXT: xorps %xmm6, %xmm6 ; SSE-NEXT: cvtsd2ss %xmm2, %xmm6 ; SSE-NEXT: andps %xmm4, %xmm6 ; SSE-NEXT: orps %xmm1, %xmm6 -; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] -; SSE-NEXT: andps %xmm5, %xmm3 +; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm6[0],xmm3[3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: andps %xmm5, %xmm0 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsd2ss %xmm2, %xmm1 ; SSE-NEXT: andps %xmm4, %xmm1 -; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE-NEXT: orps %xmm0, %xmm1 +; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[0] +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_fcopysign_fptrunc_sgn: Index: test/CodeGen/X86/combine-mul.ll =================================================================== --- test/CodeGen/X86/combine-mul.ll +++ test/CodeGen/X86/combine-mul.ll @@ -82,12 +82,11 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psllq $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psllq $4, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psllq $4, %xmm2 ; SSE-NEXT: psllq $2, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_mul_pow2c: Index: test/CodeGen/X86/combine-sdiv.ll =================================================================== --- test/CodeGen/X86/combine-sdiv.ll +++ test/CodeGen/X86/combine-sdiv.ll @@ -275,15 +275,15 @@ ; SSE41-LABEL: combine_vec_sdiv_by_pos1: ; SSE41: # %bb.0: ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $4, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrld $2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrld $3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psrld $4, %xmm0 -; SSE41-NEXT: psrld $2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pos1: @@ -320,9 +320,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrad $31, %xmm1 ; SSE-NEXT: psrld $30, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: psrad $2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: psrad $2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_sdiv_by_pow2a: @@ -562,14 +561,13 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psraw $4, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3],xmm1[4],xmm2[5,6],xmm1[7] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psraw $2, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: psraw $1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psraw $2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16: @@ -701,29 +699,27 @@ ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: psraw $4, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3],xmm2[4],xmm4[5,6],xmm2[7] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: psraw $2, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3],xmm2[4,5],xmm5[6],xmm2[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psraw $15, %xmm0 -; SSE41-NEXT: pmulhuw %xmm3, %xmm0 -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $15, %xmm2 +; SSE41-NEXT: pmulhuw %xmm3, %xmm2 +; SSE41-NEXT: paddw %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psraw $4, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3],xmm0[4],xmm3[5,6],xmm0[7] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: psraw $2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3],xmm2[4],xmm3[5,6],xmm2[7] +; SSE41-NEXT: movdqa %xmm3, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: 
psraw $1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6],xmm3[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16: @@ -929,66 +925,63 @@ ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psraw $15, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1,4,2,16,8,32,64,2] -; SSE41-NEXT: pmulhuw %xmm5, %xmm0 -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psraw $15, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,4,2,16,8,32,64,2] +; SSE41-NEXT: pmulhuw %xmm4, %xmm5 +; SSE41-NEXT: paddw %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm6 ; SSE41-NEXT: psraw $4, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3],xmm0[4],xmm6[5,6],xmm0[7] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: psraw $2, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: psraw $1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6],xmm0[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: psraw $15, %xmm1 -; SSE41-NEXT: pmulhuw %xmm5, %xmm1 -; SSE41-NEXT: paddw %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3],xmm5[4],xmm6[5,6],xmm5[7] +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: psraw $2, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5],xmm5[6],xmm6[7] +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: psraw $1, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: psraw $15, %xmm5 +; SSE41-NEXT: pmulhuw %xmm4, %xmm5 +; SSE41-NEXT: paddw %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm6 ; SSE41-NEXT: psraw $4, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3],xmm1[4],xmm6[5,6],xmm1[7] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: psraw $2, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; SSE41-NEXT: movdqa %xmm7, %xmm1 -; SSE41-NEXT: psraw $1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4,5],xmm7[6],xmm1[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psraw $15, %xmm4 -; SSE41-NEXT: pmulhuw %xmm5, %xmm4 -; SSE41-NEXT: paddw %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3],xmm5[4],xmm6[5,6],xmm5[7] +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: psraw $2, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5],xmm5[6],xmm6[7] +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: psraw $1, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, 
%xmm5 +; SSE41-NEXT: psraw $15, %xmm5 +; SSE41-NEXT: pmulhuw %xmm4, %xmm5 +; SSE41-NEXT: paddw %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm6 ; SSE41-NEXT: psraw $4, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3],xmm4[4],xmm6[5,6],xmm4[7] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: psraw $2, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; SSE41-NEXT: movdqa %xmm7, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3],xmm5[4],xmm6[5,6],xmm5[7] +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: psraw $2, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5],xmm5[6],xmm6[7] +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: psraw $1, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: psraw $15, %xmm5 +; SSE41-NEXT: pmulhuw %xmm4, %xmm5 +; SSE41-NEXT: paddw %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5,6],xmm5[7] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: psraw $2, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; SSE41-NEXT: movdqa %xmm5, %xmm4 ; SSE41-NEXT: psraw $1, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: psraw $15, %xmm2 -; SSE41-NEXT: pmulhuw %xmm5, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: psraw $4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3],xmm2[4],xmm5[5,6],xmm2[7] -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5],xmm2[6],xmm5[7] -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: psraw $1, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16: @@ -1277,25 +1270,25 @@ ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrld $28, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: psrld $30, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm0 +; SSE41-NEXT: psrld $29, %xmm2 ; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] -; SSE41-NEXT: paddd %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; SSE41-NEXT: paddd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, 
%xmm4 ; SSE41-NEXT: psrad $4, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm5 ; SSE41-NEXT: psrad $2, %xmm5 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 @@ -1314,8 +1307,7 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; SSE41-NEXT: psrad $3, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32: @@ -1469,86 +1461,83 @@ ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psrld $28, %xmm5 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: psrld $30, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psrad $4, %xmm5 -; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: psrad $31, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: psrld $28, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: psrld $30, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm4[4,5,6,7] +; SSE41-NEXT: psrld $29, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] +; SSE41-NEXT: paddd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: psrad $4, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 ; SSE41-NEXT: psrad $2, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: psrld $28, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm7 +; SSE41-NEXT: psrad $31, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: psrld $28, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 ; SSE41-NEXT: psrld $30, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7] -; SSE41-NEXT: paddd %xmm4, %xmm1 -; SSE41-NEXT: 
movdqa %xmm1, %xmm5 -; SSE41-NEXT: psrad $4, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm7 +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: psrld $29, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] +; SSE41-NEXT: paddd %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: psrad $4, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 ; SSE41-NEXT: psrad $2, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: psrld $28, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: psrad $31, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: psrld $28, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 ; SSE41-NEXT: psrld $30, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7] -; SSE41-NEXT: paddd %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: psrad $4, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: psrld $29, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] +; SSE41-NEXT: paddd %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: psrad $4, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 ; SSE41-NEXT: psrad $2, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm3, %xmm5 ; SSE41-NEXT: psrad $31, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: psrld $28, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: psrld $28, %xmm6 ; SSE41-NEXT: movdqa %xmm5, %xmm7 ; SSE41-NEXT: psrld $30, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] ; SSE41-NEXT: psrld $29, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] ; SSE41-NEXT: paddd %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: psrad $4, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: psrad $4, %xmm4 ; SSE41-NEXT: movdqa %xmm5, %xmm6 ; SSE41-NEXT: psrad $2, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = 
xmm6[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm4[4,5,6,7] ; SSE41-NEXT: psrad $3, %xmm5 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32: @@ -2238,28 +2227,29 @@ ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrld $28, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrld $28, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrld $30, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: psrld $29, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; SSE41-NEXT: paddd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrad $4, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: psrad $2, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; SSE41-NEXT: psubd %xmm0, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: psrad $3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE41-NEXT: psubd %xmm2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: @@ -2602,15 +2592,15 @@ ; ; SSE41-LABEL: non_splat_minus_one_divisor_2: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $31, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: psrad $1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psubd %xmm1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrld $31, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: paddd %xmm0, %xmm2 +; SSE41-NEXT: psrad $1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: psubd %xmm2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -2659,8 +2649,7 @@ ; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrlw $15, 
%xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_sdiv_nonuniform: @@ -3089,15 +3078,15 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psraw $4, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psraw $2, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: psraw $1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5],xmm3[6],xmm1[7] +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psraw $2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4],xmm1[5],xmm2[6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6],xmm2[7] ; SSE41-NEXT: psrlw $15, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] ; SSE41-NEXT: paddw %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq Index: test/CodeGen/X86/combine-sra.ll =================================================================== --- test/CodeGen/X86/combine-sra.ll +++ test/CodeGen/X86/combine-sra.ll @@ -187,15 +187,15 @@ ; SSE-LABEL: combine_vec_ashr_trunc_lshr: ; SSE: # %bb.0: ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: psrad $3, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: psrad $1, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: psrad $2, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE-NEXT: psrad $3, %xmm0 -; SSE-NEXT: psrad $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SSE-NEXT: retq ; ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr: @@ -227,15 +227,15 @@ ; SSE-LABEL: combine_vec_ashr_trunc_ashr: ; SSE: # %bb.0: ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: psrad $3, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: psrad $1, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: psrad $2, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE-NEXT: psrad $3, %xmm0 -; SSE-NEXT: psrad $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SSE-NEXT: retq ; ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr: Index: test/CodeGen/X86/combine-srem.ll =================================================================== --- test/CodeGen/X86/combine-srem.ll +++ test/CodeGen/X86/combine-srem.ll @@ -55,13 +55,12 @@ define i32 @combine_srem_by_minsigned(i32 %x) { ; CHECK-LABEL: combine_srem_by_minsigned: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: movl 
%edi, %eax ; CHECK-NEXT: sarl $31, %eax ; CHECK-NEXT: shrl %eax ; CHECK-NEXT: addl %edi, %eax ; CHECK-NEXT: andl $-2147483648, %eax # imm = 0x80000000 -; CHECK-NEXT: leal (%rax,%rdi), %eax +; CHECK-NEXT: addl %edi, %eax ; CHECK-NEXT: retq %1 = srem i32 %x, -2147483648 ret i32 %1 @@ -382,8 +381,7 @@ ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: combine_vec_srem_by_pow2b_neg: Index: test/CodeGen/X86/combine-srl.ll =================================================================== --- test/CodeGen/X86/combine-srl.ll +++ test/CodeGen/X86/combine-srl.ll @@ -357,33 +357,32 @@ ; SSE-LABEL: combine_vec_lshr_lzcnt_bit1: ; SSE: # %bb.0: ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pshufb %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlw $4, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pshufb %xmm1, %xmm3 -; SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: paddb %xmm3, %xmm1 +; SSE-NEXT: pshufb %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrlw $4, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pshufb %xmm2, %xmm3 +; SSE-NEXT: pcmpeqb %xmm1, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: paddb %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE-NEXT: pcmpeqb %xmm1, %xmm3 ; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: paddw %xmm3, %xmm1 -; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: psrlw $8, %xmm2 +; SSE-NEXT: paddw %xmm3, %xmm2 +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: psrld $5, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: psrld $5, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_lzcnt_bit1: Index: test/CodeGen/X86/combine-udiv.ll =================================================================== --- test/CodeGen/X86/combine-udiv.ll +++ test/CodeGen/X86/combine-udiv.ll @@ -283,15 +283,15 @@ ; ; SSE41-LABEL: combine_vec_udiv_by_pow2b: ; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $4, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrld $2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrld $3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psrld $4, %xmm0 -; SSE41-NEXT: psrld $2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_udiv_by_pow2b: @@ -662,11 +662,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) { ; SSE2-LABEL: combine_vec_udiv_nonuniform2: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1 @@ -677,10 +677,9 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_vec_udiv_nonuniform2: @@ -850,38 +849,37 @@ define <8 x i16> @pr38477(<8 x i16> %a0) { ; SSE2-LABEL: pr38477: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4957,57457,4103,16385,35545,2048,2115] -; SSE2-NEXT: pmulhuw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psubw %xmm2, %xmm1 -; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; SSE2-NEXT: paddw %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115] +; SSE2-NEXT: pmulhuw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psubw %xmm1, %xmm2 +; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm2 +; SSE2-NEXT: paddw %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,0,65535] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: pr38477: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4957,57457,4103,16385,35545,2048,2115] -; SSE41-NEXT: pmulhuw %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psubw %xmm2, %xmm1 -; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: paddw %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = -; SSE41-NEXT: pmulhuw %xmm1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5],xmm1[6],xmm2[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115] +; SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: 
psubw %xmm1, %xmm2 +; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm2 +; SSE41-NEXT: paddw %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; SSE41-NEXT: pmulhuw %xmm2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX-LABEL: pr38477: Index: test/CodeGen/X86/funnel-shift-rot.ll =================================================================== --- test/CodeGen/X86/funnel-shift-rot.ll +++ test/CodeGen/X86/funnel-shift-rot.ll @@ -127,8 +127,7 @@ ; X32-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE2-NEXT: orps %xmm0, %xmm2 -; X32-SSE2-NEXT: movaps %xmm2, %xmm0 +; X32-SSE2-NEXT: por %xmm2, %xmm0 ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: rotl_v4i32: @@ -281,37 +280,35 @@ ; X32-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31] ; X32-SSE2-NEXT: pxor %xmm3, %xmm3 ; X32-SSE2-NEXT: psubd %xmm1, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm4 -; X32-SSE2-NEXT: pand %xmm2, %xmm4 -; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,3,3,3,4,5,6,7] +; X32-SSE2-NEXT: pand %xmm2, %xmm1 +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] ; X32-SSE2-NEXT: movdqa %xmm0, %xmm5 -; X32-SSE2-NEXT: psrld %xmm1, %xmm5 -; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,1,1,1,4,5,6,7] -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrld %xmm6, %xmm1 -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; X32-SSE2-NEXT: psrld %xmm4, %xmm5 +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; X32-SSE2-NEXT: movdqa %xmm0, %xmm6 -; X32-SSE2-NEXT: psrld %xmm5, %xmm6 -; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] +; X32-SSE2-NEXT: psrld %xmm4, %xmm6 +; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] ; X32-SSE2-NEXT: movdqa %xmm0, %xmm5 ; X32-SSE2-NEXT: psrld %xmm4, %xmm5 -; X32-SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; X32-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3] +; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X32-SSE2-NEXT: psrld %xmm1, %xmm4 +; X32-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; X32-SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[0,3] ; X32-SSE2-NEXT: pand %xmm2, %xmm3 ; X32-SSE2-NEXT: pslld $23, %xmm3 ; X32-SSE2-NEXT: paddd {{\.LCPI.*}}, %xmm3 -; X32-SSE2-NEXT: cvttps2dq %xmm3, %xmm2 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; X32-SSE2-NEXT: pmuludq %xmm2, %xmm0 +; X32-SSE2-NEXT: cvttps2dq %xmm3, %xmm1 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X32-SSE2-NEXT: pmuludq %xmm1, %xmm0 ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X32-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: orps %xmm0, %xmm1 -; X32-SSE2-NEXT: movaps %xmm1, %xmm0 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 
+; X32-SSE2-NEXT: por %xmm6, %xmm0
 ; X32-SSE2-NEXT: retl
 ;
 ; X64-AVX2-LABEL: rotr_v4i32:
Index: test/CodeGen/X86/funnel-shift.ll
===================================================================
--- test/CodeGen/X86/funnel-shift.ll
+++ test/CodeGen/X86/funnel-shift.ll
@@ -119,10 +119,9 @@
 ; X64-AVX2-NEXT: subl %r8d, %ecx
 ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-AVX2-NEXT: shrq %cl, %rsi
-; X64-AVX2-NEXT: orq %rax, %rsi
+; X64-AVX2-NEXT: orq %rsi, %rax
 ; X64-AVX2-NEXT: testq %r8, %r8
-; X64-AVX2-NEXT: cmoveq %rdi, %rsi
-; X64-AVX2-NEXT: movq %rsi, %rax
+; X64-AVX2-NEXT: cmoveq %rdi, %rax
 ; X64-AVX2-NEXT: retq
 %f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
 ret i37 %f
@@ -313,10 +312,10 @@
 ; X64-AVX2-NEXT: subl %r8d, %ecx
 ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-AVX2-NEXT: shlq %cl, %rdi
-; X64-AVX2-NEXT: orq %r9, %rdi
+; X64-AVX2-NEXT: orq %rdi, %r9
 ; X64-AVX2-NEXT: testq %r8, %r8
-; X64-AVX2-NEXT: cmoveq %rsi, %rdi
-; X64-AVX2-NEXT: movq %rdi, %rax
+; X64-AVX2-NEXT: cmoveq %rsi, %r9
+; X64-AVX2-NEXT: movq %r9, %rax
 ; X64-AVX2-NEXT: retq
 %f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
 ret i37 %f
Index: test/CodeGen/X86/haddsub-undef.ll
===================================================================
--- test/CodeGen/X86/haddsub-undef.ll
+++ test/CodeGen/X86/haddsub-undef.ll
@@ -104,8 +104,7 @@
 ; SSE: # %bb.0:
 ; SSE-NEXT: movapd %xmm0, %xmm1
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: addsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test5_undef:
Index: test/CodeGen/X86/iabs.ll
===================================================================
--- test/CodeGen/X86/iabs.ll
+++ test/CodeGen/X86/iabs.ll
@@ -22,11 +22,9 @@
 ; X64-LABEL: test_i8:
 ; X64: # %bb.0:
 ; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: sarb $7, %cl
-; X64-NEXT: addb %cl, %al
-; X64-NEXT: xorb %cl, %al
-; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: sarb $7, %al
+; X64-NEXT: addb %al, %dil
+; X64-NEXT: xorb %dil, %al
 ; X64-NEXT: retq
 %tmp1neg = sub i8 0, %a
 %b = icmp sgt i8 %a, -1
Index: test/CodeGen/X86/imul.ll
===================================================================
--- test/CodeGen/X86/imul.ll
+++ test/CodeGen/X86/imul.ll
@@ -215,10 +215,9 @@
 define i32 @mul33_32(i32 %A) {
 ; X64-LABEL: mul33_32:
 ; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-NEXT: movl %edi, %eax
 ; X64-NEXT: shll $5, %eax
-; X64-NEXT: leal (%rax,%rdi), %eax
+; X64-NEXT: addl %edi, %eax
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: mul33_32:
@@ -344,10 +343,9 @@
 define i32 @test2(i32 %a) {
 ; X64-LABEL: test2:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-NEXT: movl %edi, %eax
 ; X64-NEXT: shll $5, %eax
-; X64-NEXT: leal (%rax,%rdi), %eax
+; X64-NEXT: addl %edi, %eax
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: test2:
@@ -365,10 +363,9 @@
 define i32 @test3(i32 %a) {
 ; X64-LABEL: test3:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-NEXT: movl %edi, %eax
 ; X64-NEXT: shll $5, %eax
-; X64-NEXT: leal (%rax,%rdi), %eax
+; X64-NEXT: addl %edi, %eax
 ; X64-NEXT: negl %eax
 ; X64-NEXT: retq
 ;
@@ -446,7 +443,7 @@
 ; X64: # %bb.0: # %entry
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: shlq $5, %rax
-; X64-NEXT: leaq (%rax,%rdi), %rax
+; X64-NEXT: addq %rdi, %rax
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: test6:
@@ -469,7 +466,7 @@
 ; X64: # %bb.0: # %entry
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: shlq $5, %rax
-; X64-NEXT: leaq (%rax,%rdi), %rax
+; X64-NEXT: addq %rdi, %rax
 ; X64-NEXT: negq %rax
 ; X64-NEXT: retq
 ;
Index: test/CodeGen/X86/mul-constant-i16.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i16.ll
+++ test/CodeGen/X86/mul-constant-i16.ll
@@ -318,10 +318,9 @@
 ;
 ; X64-LABEL: test_mul_by_17:
 ; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-NEXT: movl %edi, %eax
 ; X64-NEXT: shll $4, %eax
-; X64-NEXT: leal (%rax,%rdi), %eax
+; X64-NEXT: addl %edi, %eax
 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
 ; X64-NEXT: retq
 %mul = mul nsw i16 %x, 17
Index: test/CodeGen/X86/mul-constant-i32.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i32.ll
+++ test/CodeGen/X86/mul-constant-i32.ll
@@ -841,18 +841,16 @@
 ;
 ; X64-HSW-LABEL: test_mul_by_17:
 ; X64-HSW: # %bb.0:
-; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
 ; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50]
-; X64-HSW-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50]
+; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
 ; X64-HSW-NEXT: retq # sched: [7:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_17:
 ; X64-JAG: # %bb.0:
-; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
 ; X64-JAG-NEXT: shll $4, %eax # sched: [1:0.50]
-; X64-JAG-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50]
+; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
 ; X64-JAG-NEXT: retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_17:
@@ -872,10 +870,9 @@
 ;
 ; X64-SLM-LABEL: test_mul_by_17:
 ; X64-SLM: # %bb.0:
-; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
 ; X64-SLM-NEXT: shll $4, %eax # sched: [1:1.00]
-; X64-SLM-NEXT: leal (%rax,%rdi), %eax # sched: [1:1.00]
+; X64-SLM-NEXT: addl %edi, %eax # sched: [1:0.50]
 ; X64-SLM-NEXT: retq # sched: [4:1.00]
 ;
 ; SLM-NOOPT-LABEL: test_mul_by_17:
@@ -1875,10 +1872,9 @@
 ;
 ; X64-SLM-LABEL: test_mul_by_66:
 ; X64-SLM: # %bb.0:
-; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
 ; X64-SLM-NEXT: shll $6, %eax # sched: [1:1.00]
-; X64-SLM-NEXT: leal (%rax,%rdi), %eax # sched: [1:1.00]
+; X64-SLM-NEXT: addl %edi, %eax # sched: [1:0.50]
 ; X64-SLM-NEXT: addl %edi, %eax # sched: [1:0.50]
 ; X64-SLM-NEXT: retq # sched: [4:1.00]
 ;
Index: test/CodeGen/X86/mul-constant-i64.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i64.ll
+++ test/CodeGen/X86/mul-constant-i64.ll
@@ -872,14 +872,14 @@
 ; X64-HSW: # %bb.0:
 ; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
 ; X64-HSW-NEXT: shlq $4, %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
 ; X64-HSW-NEXT: retq # sched: [7:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_17:
 ; X64-JAG: # %bb.0:
 ; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.50]
 ; X64-JAG-NEXT: shlq $4, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
 ; X64-JAG-NEXT: retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_17:
Index: test/CodeGen/X86/paddus.ll
===================================================================
--- test/CodeGen/X86/paddus.ll
+++ test/CodeGen/X86/paddus.ll
@@ -800,24 +800,22 @@
 define <8 x i16> @test23(<8 x i16> %x) {
 ; SSE2-LABEL: test23:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = 
[32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtw %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test23: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtw %xmm2, %xmm1 -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtw %xmm1, %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test23: @@ -1029,38 +1027,34 @@ define <16 x i16> @test29(<16 x i16> %x) { ; SSE2-LABEL: test29: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test29: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pcmpgtw %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm0, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpgtw %xmm4, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtw %xmm2, %xmm3 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test29: @@ -1352,25 +1346,21 @@ ; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: movdqa %xmm3, %xmm8 ; SSE2-NEXT: pcmpgtw %xmm5, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 ; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm6, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm4, %xmm7 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa 
%xmm2, %xmm7 +; SSE2-NEXT: pcmpgtw %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtw %xmm7, %xmm6 +; SSE2-NEXT: pcmpgtw %xmm5, %xmm6 ; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm7 -; SSE2-NEXT: por %xmm0, %xmm7 -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: por %xmm3, %xmm8 -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test35: @@ -1384,25 +1374,21 @@ ; SSSE3-NEXT: pxor %xmm4, %xmm5 ; SSSE3-NEXT: movdqa %xmm3, %xmm8 ; SSSE3-NEXT: pcmpgtw %xmm5, %xmm8 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm4, %xmm6 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtw %xmm6, %xmm5 -; SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSSE3-NEXT: pxor %xmm4, %xmm7 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pcmpgtw %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 ; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pcmpgtw %xmm7, %xmm6 +; SSSE3-NEXT: pcmpgtw %xmm5, %xmm6 ; SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm7 -; SSSE3-NEXT: pcmpgtw %xmm4, %xmm7 -; SSSE3-NEXT: por %xmm0, %xmm7 -; SSSE3-NEXT: por %xmm1, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm3, %xmm8 -; SSSE3-NEXT: movdqa %xmm7, %xmm0 -; SSSE3-NEXT: movdqa %xmm6, %xmm1 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: pcmpgtw %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: por %xmm6, %xmm1 +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: por %xmm8, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test35: Index: test/CodeGen/X86/palignr.ll =================================================================== --- test/CodeGen/X86/palignr.ll +++ test/CodeGen/X86/palignr.ll @@ -167,8 +167,8 @@ ; CHECK-SSE2-LABEL: test9: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; CHECK-SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; CHECK-SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; CHECK-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] ; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retl ; Index: test/CodeGen/X86/pmul.ll =================================================================== --- test/CodeGen/X86/pmul.ll +++ test/CodeGen/X86/pmul.ll @@ -1084,11 +1084,11 @@ ; SSE2-LABEL: mul_v4i64_zero_upper: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 @@ -1238,27 +1238,27 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE2-LABEL: mul_v8i64_zero_upper: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: pmuludq %xmm7, %xmm4 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pmuludq %xmm6, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: pmuludq %xmm0, %xmm5 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: pmuludq %xmm7, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] ; SSE2-NEXT: movaps %xmm4, %xmm0 -; SSE2-NEXT: movaps %xmm5, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v8i64_zero_upper: Index: test/CodeGen/X86/popcnt.ll =================================================================== --- test/CodeGen/X86/popcnt.ll +++ test/CodeGen/X86/popcnt.ll @@ -223,7 +223,7 @@ ; X64-NEXT: addq %rcx, %rdi ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: shrq $4, %rax -; X64-NEXT: leaq (%rax,%rdi), %rax +; X64-NEXT: addq %rdi, %rax ; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F ; X64-NEXT: andq %rax, %rcx ; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 Index: test/CodeGen/X86/powi.ll =================================================================== --- test/CodeGen/X86/powi.ll +++ test/CodeGen/X86/powi.ll @@ -10,8 +10,7 @@ ; CHECK-NEXT: mulsd %xmm1, %xmm1 ; CHECK-NEXT: mulsd %xmm1, %xmm0 ; CHECK-NEXT: mulsd %xmm1, %xmm1 -; CHECK-NEXT: mulsd %xmm0, %xmm1 -; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: mulsd %xmm1, %xmm0 ; CHECK-NEXT: retq %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; [#uses=1] ret double %ret Index: test/CodeGen/X86/psubus.ll =================================================================== --- test/CodeGen/X86/psubus.ll +++ test/CodeGen/X86/psubus.ll @@ -1428,24 +1428,25 @@ ; SSE2-LABEL: 
psubus_8i32_max: ; SSE2: # %bb.0: # %vector.ph ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 ; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: psubd %xmm1, %xmm0 ; SSE2-NEXT: psubd %xmm2, %xmm4 @@ -1816,54 +1817,55 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; SSE2-LABEL: psubus_16i32_max: ; SSE2: # %bb.0: # %vector.ph -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: por %xmm7, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm9 -; SSE2-NEXT: por %xmm0, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm7 +; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 ; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: por %xmm7, %xmm0 
+; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm10 ; SSE2-NEXT: pandn %xmm2, %xmm0 ; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pxor %xmm7, %xmm10 -; SSE2-NEXT: movdqa %xmm8, %xmm6 -; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm8 +; SSE2-NEXT: pand %xmm6, %xmm9 ; SSE2-NEXT: pandn %xmm5, %xmm6 -; SSE2-NEXT: por %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: pxor %xmm7, %xmm8 -; SSE2-NEXT: por %xmm1, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pandn %xmm4, %xmm7 -; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: por %xmm9, %xmm6 +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: por %xmm8, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm8 +; SSE2-NEXT: pandn %xmm4, %xmm1 +; SSE2-NEXT: por %xmm8, %xmm1 ; SSE2-NEXT: psubd %xmm4, %xmm1 ; SSE2-NEXT: psubd %xmm5, %xmm6 ; SSE2-NEXT: psubd %xmm2, %xmm0 -; SSE2-NEXT: psubd %xmm3, %xmm9 -; SSE2-NEXT: pslld $16, %xmm9 -; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: psubd %xmm3, %xmm7 +; SSE2-NEXT: pslld $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm7 ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm9, %xmm0 +; SSE2-NEXT: packssdw %xmm7, %xmm0 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 @@ -1873,54 +1875,55 @@ ; ; SSSE3-LABEL: psubus_16i32_max: ; SSSE3: # %bb.0: # %vector.ph -; SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] ; SSSE3-NEXT: movdqa %xmm0, %xmm10 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: pxor %xmm7, %xmm6 -; SSSE3-NEXT: movdqa %xmm0, %xmm9 -; SSSE3-NEXT: por %xmm7, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9 -; SSSE3-NEXT: pand %xmm9, %xmm0 -; SSSE3-NEXT: pandn %xmm3, %xmm9 -; SSSE3-NEXT: por %xmm0, %xmm9 +; SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pandn %xmm3, %xmm7 +; SSSE3-NEXT: por %xmm0, %xmm7 ; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm7, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm6 ; SSSE3-NEXT: movdqa %xmm10, %xmm0 -; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: por 
%xmm1, %xmm0 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm10 ; SSSE3-NEXT: pandn %xmm2, %xmm0 ; SSSE3-NEXT: por %xmm10, %xmm0 ; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSSE3-NEXT: pxor %xmm7, %xmm10 -; SSSE3-NEXT: movdqa %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm7, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm10 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: por %xmm1, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm8 +; SSSE3-NEXT: pand %xmm6, %xmm9 ; SSSE3-NEXT: pandn %xmm5, %xmm6 -; SSSE3-NEXT: por %xmm8, %xmm6 -; SSSE3-NEXT: movdqa %xmm4, %xmm8 -; SSSE3-NEXT: pxor %xmm7, %xmm8 -; SSSE3-NEXT: por %xmm1, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: pandn %xmm4, %xmm7 -; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: por %xmm9, %xmm6 +; SSSE3-NEXT: movdqa %xmm4, %xmm9 +; SSSE3-NEXT: pxor %xmm1, %xmm9 +; SSSE3-NEXT: por %xmm8, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm8 +; SSSE3-NEXT: pandn %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm8, %xmm1 ; SSSE3-NEXT: psubd %xmm4, %xmm1 ; SSSE3-NEXT: psubd %xmm5, %xmm6 ; SSSE3-NEXT: psubd %xmm2, %xmm0 -; SSSE3-NEXT: psubd %xmm3, %xmm9 -; SSSE3-NEXT: pslld $16, %xmm9 -; SSSE3-NEXT: psrad $16, %xmm9 +; SSSE3-NEXT: psubd %xmm3, %xmm7 +; SSSE3-NEXT: pslld $16, %xmm7 +; SSSE3-NEXT: psrad $16, %xmm7 ; SSSE3-NEXT: pslld $16, %xmm0 ; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: packssdw %xmm9, %xmm0 +; SSSE3-NEXT: packssdw %xmm7, %xmm0 ; SSSE3-NEXT: pslld $16, %xmm6 ; SSSE3-NEXT: psrad $16, %xmm6 ; SSSE3-NEXT: pslld $16, %xmm1 Index: test/CodeGen/X86/rotate-extract.ll =================================================================== --- test/CodeGen/X86/rotate-extract.ll +++ test/CodeGen/X86/rotate-extract.ll @@ -156,7 +156,7 @@ ; X64-NEXT: shlq $5, %rax ; X64-NEXT: shlq $10, %rdi ; X64-NEXT: shrq $57, %rax -; X64-NEXT: leaq (%rax,%rdi), %rax +; X64-NEXT: orq %rdi, %rax ; X64-NEXT: retq %lhs_mul = shl i64 %i, 5 %rhs_mul = shl i64 %i, 10 @@ -179,12 +179,11 @@ ; ; X64-LABEL: no_extract_shrl: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andl $-8, %eax ; X64-NEXT: shll $25, %eax ; X64-NEXT: shrl $9, %edi -; X64-NEXT: leal (%rdi,%rax), %eax +; X64-NEXT: orl %edi, %eax ; X64-NEXT: retq %lhs_div = lshr i32 %i, 3 %rhs_div = lshr i32 %i, 9 Index: test/CodeGen/X86/sat-add.ll =================================================================== --- test/CodeGen/X86/sat-add.ll +++ test/CodeGen/X86/sat-add.ll @@ -241,12 +241,11 @@ define i16 @unsigned_sat_variable_i16_using_min(i16 %x, i16 %y) { ; ANY-LABEL: unsigned_sat_variable_i16_using_min: ; ANY: # %bb.0: -; ANY-NEXT: # kill: def $esi killed $esi def $rsi ; ANY-NEXT: movl %esi, %eax ; ANY-NEXT: notl %eax ; ANY-NEXT: cmpw %ax, %di ; ANY-NEXT: cmovbl %edi, %eax -; ANY-NEXT: leal (%rax,%rsi), %eax +; ANY-NEXT: addl %esi, %eax ; ANY-NEXT: # kill: def $ax killed $ax killed $eax ; ANY-NEXT: retq %noty = xor i16 %y, -1 @@ -292,12 +291,11 @@ define i32 @unsigned_sat_variable_i32_using_min(i32 %x, i32 %y) { ; ANY-LABEL: unsigned_sat_variable_i32_using_min: ; ANY: # %bb.0: -; ANY-NEXT: # kill: def $esi killed $esi def $rsi ; ANY-NEXT: movl %esi, %eax ; ANY-NEXT: notl %eax ; ANY-NEXT: cmpl %eax, %edi ; ANY-NEXT: cmovbl %edi, %eax -; ANY-NEXT: leal (%rax,%rsi), %eax +; ANY-NEXT: addl %esi, %eax ; ANY-NEXT: retq %noty = xor i32 %y, -1 %c = icmp ult i32 %x, %noty @@ -344,7 +342,7 @@ ; ANY-NEXT: notq %rax ; ANY-NEXT: cmpq %rax, %rdi ; ANY-NEXT: cmovbq %rdi, %rax -; ANY-NEXT: 
leaq (%rax,%rsi), %rax +; ANY-NEXT: addq %rsi, %rax ; ANY-NEXT: retq %noty = xor i64 %y, -1 %c = icmp ult i64 %x, %noty @@ -497,15 +495,14 @@ ; ; SSE41-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [42,42,42,42] -; SSE41-NEXT: paddd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pminud %xmm2, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42,42,42,42] +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pminud %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq %a = add <4 x i32> %x, %c = icmp ugt <4 x i32> %x, %a @@ -795,11 +792,10 @@ ; SSE41-NEXT: paddd %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pminud %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: por %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq %a = add <4 x i32> %x, %y %c = icmp ugt <4 x i32> %x, %a Index: test/CodeGen/X86/sqrt-fastmath.ll =================================================================== --- test/CodeGen/X86/sqrt-fastmath.ll +++ test/CodeGen/X86/sqrt-fastmath.ll @@ -175,17 +175,17 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 { ; SSE-LABEL: sqrt_v4f32_check_denorms: ; SSE: # %bb.0: -; SSE-NEXT: rsqrtps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm2, %xmm1 +; SSE-NEXT: rsqrtps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01] -; SSE-NEXT: mulps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: addps {{.*}}(%rip), %xmm1 -; SSE-NEXT: mulps %xmm3, %xmm1 +; SSE-NEXT: mulps %xmm2, %xmm3 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps {{.*}}(%rip), %xmm2 +; SSE-NEXT: mulps %xmm3, %xmm2 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.175494e-38,1.175494e-38,1.175494e-38,1.175494e-38] -; SSE-NEXT: cmpleps %xmm0, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.175494e-38,1.175494e-38,1.175494e-38,1.175494e-38] +; SSE-NEXT: cmpleps %xmm0, %xmm1 ; SSE-NEXT: andps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq Index: test/CodeGen/X86/sse-minmax.ll =================================================================== --- test/CodeGen/X86/sse-minmax.ll +++ test/CodeGen/X86/sse-minmax.ll @@ -103,8 +103,7 @@ ; STRICT-NEXT: cmplesd %xmm1, %xmm2 ; STRICT-NEXT: andpd %xmm2, %xmm0 ; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm0, %xmm2 -; STRICT-NEXT: movapd %xmm2, %xmm0 +; STRICT-NEXT: orpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ole: @@ -261,11 +260,10 @@ define double @ole_x(double %x) { ; STRICT-LABEL: ole_x: ; STRICT: # %bb.0: -; STRICT-NEXT: xorpd %xmm2, %xmm2 -; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmplesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm0, %xmm1 -; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: xorpd %xmm1, %xmm1 +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: cmplesd %xmm1, %xmm2 +; STRICT-NEXT: andpd %xmm2, %xmm0 ; 
STRICT-NEXT: retq ; ; RELAX-LABEL: ole_x: @@ -338,8 +336,7 @@ ; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 ; STRICT-NEXT: andpd %xmm2, %xmm0 ; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm0, %xmm2 -; STRICT-NEXT: movapd %xmm2, %xmm0 +; STRICT-NEXT: orpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ugt: @@ -499,11 +496,10 @@ define double @ugt_x(double %x) { ; STRICT-LABEL: ugt_x: ; STRICT: # %bb.0: -; STRICT-NEXT: xorpd %xmm2, %xmm2 -; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmpnlesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm0, %xmm1 -; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: xorpd %xmm1, %xmm1 +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 +; STRICT-NEXT: andpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ugt_x: @@ -762,13 +758,12 @@ define double @ole_y(double %x) { ; STRICT-LABEL: ole_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmplesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm1, %xmm0 -; STRICT-NEXT: andnpd %xmm2, %xmm1 -; STRICT-NEXT: orpd %xmm0, %xmm1 -; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: cmplesd %xmm1, %xmm2 +; STRICT-NEXT: andpd %xmm2, %xmm0 +; STRICT-NEXT: andnpd %xmm1, %xmm2 +; STRICT-NEXT: orpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ole_y: @@ -839,13 +834,12 @@ define double @ugt_y(double %x) { ; STRICT-LABEL: ugt_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmpnlesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm1, %xmm0 -; STRICT-NEXT: andnpd %xmm2, %xmm1 -; STRICT-NEXT: orpd %xmm0, %xmm1 -; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 +; STRICT-NEXT: andpd %xmm2, %xmm0 +; STRICT-NEXT: andnpd %xmm1, %xmm2 +; STRICT-NEXT: orpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ugt_y: Index: test/CodeGen/X86/sse2-intrinsics-canonical.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-canonical.ll +++ test/CodeGen/X86/sse2-intrinsics-canonical.ll @@ -236,9 +236,8 @@ ; SSE-NEXT: pcmpgtd %xmm3, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd3] ; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] ; SSE-NEXT: pandn %xmm3, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd3] -; SSE-NEXT: por %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd0] -; SSE-NEXT: psubd %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xfa,0xd1] -; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2] +; SSE-NEXT: por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2] +; SSE-NEXT: psubd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xfa,0xc1] ; SSE-NEXT: retl ## encoding: [0xc3] ; ; AVX2-LABEL: test_x86_sse2_psubus_w_64: Index: test/CodeGen/X86/sse2-schedule.ll =================================================================== --- test/CodeGen/X86/sse2-schedule.ll +++ test/CodeGen/X86/sse2-schedule.ll @@ -7705,8 +7705,7 @@ ; GENERIC-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: pandn (%rdi), %xmm1 # sched: [7:0.50] -; GENERIC-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] -; GENERIC-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; ATOM-LABEL: test_pandn: @@ -7714,8 +7713,7 @@ ; ATOM-NEXT: pandn %xmm1, %xmm0 
# sched: [1:0.50] ; ATOM-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.50] ; ATOM-NEXT: pandn (%rdi), %xmm1 # sched: [1:1.00] -; ATOM-NEXT: paddq %xmm0, %xmm1 # sched: [2:1.00] -; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; ATOM-NEXT: paddq %xmm1, %xmm0 # sched: [2:1.00] ; ATOM-NEXT: retq # sched: [79:39.50] ; ; SLM-LABEL: test_pandn: @@ -7723,8 +7721,7 @@ ; SLM-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.50] ; SLM-NEXT: pandn (%rdi), %xmm1 # sched: [4:1.00] -; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] -; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-SSE-LABEL: test_pandn: @@ -7732,8 +7729,7 @@ ; SANDY-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33] ; SANDY-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33] ; SANDY-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [7:0.50] -; SANDY-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] -; SANDY-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; SANDY-SSE-NEXT: retq # sched: [1:1.00] ; ; SANDY-LABEL: test_pandn: @@ -7748,8 +7744,7 @@ ; HASWELL-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33] ; HASWELL-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33] ; HASWELL-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [7:0.50] -; HASWELL-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] -; HASWELL-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; HASWELL-SSE-NEXT: retq # sched: [7:1.00] ; ; HASWELL-LABEL: test_pandn: @@ -7764,8 +7759,7 @@ ; BROADWELL-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33] ; BROADWELL-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33] ; BROADWELL-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [6:0.50] -; BROADWELL-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] -; BROADWELL-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33] +; BROADWELL-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; BROADWELL-SSE-NEXT: retq # sched: [7:1.00] ; ; BROADWELL-LABEL: test_pandn: @@ -7780,8 +7774,7 @@ ; SKYLAKE-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33] ; SKYLAKE-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33] ; SKYLAKE-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [7:0.50] -; SKYLAKE-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.33] -; SKYLAKE-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33] +; SKYLAKE-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.33] ; SKYLAKE-SSE-NEXT: retq # sched: [7:1.00] ; ; SKYLAKE-LABEL: test_pandn: @@ -7796,8 +7789,7 @@ ; SKX-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33] ; SKX-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33] ; SKX-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [7:0.50] -; SKX-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.33] -; SKX-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33] +; SKX-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.33] ; SKX-SSE-NEXT: retq # sched: [7:1.00] ; ; SKX-LABEL: test_pandn: @@ -7812,8 +7804,7 @@ ; BTVER2-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.50] ; BTVER2-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [6:1.00] -; BTVER2-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] -; BTVER2-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-SSE-NEXT: retq # sched: [4:1.00] ; ; BTVER2-LABEL: test_pandn: @@ -7828,8 +7819,7 @@ ; ZNVER1-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.25] ; ZNVER1-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.25] ; ZNVER1-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [8:0.50] -; 
ZNVER1-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.25] -; ZNVER1-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.25] ; ZNVER1-SSE-NEXT: retq # sched: [1:0.50] ; ; ZNVER1-LABEL: test_pandn: Index: test/CodeGen/X86/sse41-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse41-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse41-intrinsics-fast-isel.ll @@ -835,13 +835,12 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psllq $32, %xmm2 ; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: psrad $31, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE-NEXT: pmuldq %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE-NEXT: pmuldq %xmm2, %xmm0 ; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_mm_mul_epi32: Index: test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll =================================================================== --- test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll +++ test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll @@ -40,10 +40,9 @@ ; X64-LABEL: test__blcic_u64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq $-1, %rcx -; X64-NEXT: addq $1, %rax -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: xorq $-1, %rax +; X64-NEXT: addq $1, %rdi +; X64-NEXT: andq %rdi, %rax ; X64-NEXT: retq %1 = xor i64 %a0, -1 %2 = add i64 %a0, 1 @@ -89,10 +88,9 @@ ; X64-LABEL: test__blsic_u64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq $-1, %rcx -; X64-NEXT: subq $1, %rax -; X64-NEXT: orq %rcx, %rax +; X64-NEXT: xorq $-1, %rax +; X64-NEXT: subq $1, %rdi +; X64-NEXT: orq %rdi, %rax ; X64-NEXT: retq %1 = xor i64 %a0, -1 %2 = sub i64 %a0, 1 @@ -104,10 +102,9 @@ ; X64-LABEL: test__t1mskc_u64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq $-1, %rcx -; X64-NEXT: addq $1, %rax -; X64-NEXT: orq %rcx, %rax +; X64-NEXT: xorq $-1, %rax +; X64-NEXT: addq $1, %rdi +; X64-NEXT: orq %rdi, %rax ; X64-NEXT: retq %1 = xor i64 %a0, -1 %2 = add i64 %a0, 1 @@ -119,10 +116,9 @@ ; X64-LABEL: test__tzmsk_u64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq $-1, %rcx -; X64-NEXT: subq $1, %rax -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: xorq $-1, %rax +; X64-NEXT: subq $1, %rdi +; X64-NEXT: andq %rdi, %rax ; X64-NEXT: retq %1 = xor i64 %a0, -1 %2 = sub i64 %a0, 1 Index: test/CodeGen/X86/tbm-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/tbm-intrinsics-fast-isel.ll +++ test/CodeGen/X86/tbm-intrinsics-fast-isel.ll @@ -72,10 +72,9 @@ ; X64-LABEL: test__blcic_u32: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: xorl $-1, %ecx -; X64-NEXT: addl $1, %eax -; X64-NEXT: andl %ecx, %eax +; X64-NEXT: xorl $-1, %eax +; X64-NEXT: addl $1, %edi +; X64-NEXT: andl %edi, %eax ; X64-NEXT: retq %1 = xor i32 %a0, -1 %2 = add i32 %a0, 1 @@ -154,10 +153,9 @@ ; X64-LABEL: test__blsic_u32: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %edi, %ecx -; 
X64-NEXT: xorl $-1, %ecx -; X64-NEXT: subl $1, %eax -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: xorl $-1, %eax +; X64-NEXT: subl $1, %edi +; X64-NEXT: orl %edi, %eax ; X64-NEXT: retq %1 = xor i32 %a0, -1 %2 = sub i32 %a0, 1 @@ -178,10 +176,9 @@ ; X64-LABEL: test__t1mskc_u32: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: xorl $-1, %ecx -; X64-NEXT: addl $1, %eax -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: xorl $-1, %eax +; X64-NEXT: addl $1, %edi +; X64-NEXT: orl %edi, %eax ; X64-NEXT: retq %1 = xor i32 %a0, -1 %2 = add i32 %a0, 1 @@ -202,10 +199,9 @@ ; X64-LABEL: test__tzmsk_u32: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: xorl $-1, %ecx -; X64-NEXT: subl $1, %eax -; X64-NEXT: andl %ecx, %eax +; X64-NEXT: xorl $-1, %eax +; X64-NEXT: subl $1, %edi +; X64-NEXT: andl %edi, %eax ; X64-NEXT: retq %1 = xor i32 %a0, -1 %2 = sub i32 %a0, 1 Index: test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll =================================================================== --- test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll +++ test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll @@ -545,18 +545,18 @@ define i32 @out_constant_varx_mone(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_varx_mone: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edi, %eax -; CHECK-NOBMI-NEXT: andl %edx, %eax -; CHECK-NOBMI-NEXT: notl %edx -; CHECK-NOBMI-NEXT: orl %edx, %eax +; CHECK-NOBMI-NEXT: andl %edx, %edi +; CHECK-NOBMI-NEXT: movl %edx, %eax +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: orl %edi, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_varx_mone: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %edi, %eax -; CHECK-BMI-NEXT: andl %edx, %eax -; CHECK-BMI-NEXT: notl %edx -; CHECK-BMI-NEXT: orl %edx, %eax +; CHECK-BMI-NEXT: andl %edx, %edi +; CHECK-BMI-NEXT: movl %edx, %eax +; CHECK-BMI-NEXT: notl %eax +; CHECK-BMI-NEXT: orl %edi, %eax ; CHECK-BMI-NEXT: retq %notmask = xor i32 %mask, -1 %mx = and i32 %mask, %x @@ -676,11 +676,10 @@ ; CHECK-NOBMI-LABEL: out_constant_varx_42_invmask: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: movl %edx, %eax -; CHECK-NOBMI-NEXT: movl %edx, %ecx -; CHECK-NOBMI-NEXT: notl %ecx -; CHECK-NOBMI-NEXT: andl %edi, %ecx -; CHECK-NOBMI-NEXT: andl $42, %eax -; CHECK-NOBMI-NEXT: orl %ecx, %eax +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: andl %edi, %eax +; CHECK-NOBMI-NEXT: andl $42, %edx +; CHECK-NOBMI-NEXT: orl %edx, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_varx_42_invmask: @@ -762,18 +761,18 @@ define i32 @out_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_mone_vary_invmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax -; CHECK-NOBMI-NEXT: andl %edx, %eax -; CHECK-NOBMI-NEXT: notl %edx -; CHECK-NOBMI-NEXT: orl %edx, %eax +; CHECK-NOBMI-NEXT: andl %edx, %esi +; CHECK-NOBMI-NEXT: movl %edx, %eax +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: orl %esi, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_mone_vary_invmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax -; CHECK-BMI-NEXT: andl %edx, %eax -; CHECK-BMI-NEXT: notl %edx -; CHECK-BMI-NEXT: orl %edx, %eax +; CHECK-BMI-NEXT: andl %edx, %esi +; CHECK-BMI-NEXT: movl %edx, %eax +; CHECK-BMI-NEXT: notl %eax +; CHECK-BMI-NEXT: orl %esi, %eax ; CHECK-BMI-NEXT: retq %notmask = xor i32 %mask, -1 %mx = and i32 %notmask, -1 @@ -850,20 +849,20 @@ define i32 @out_constant_42_vary_invmask(i32 %x, i32 
%y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_42_vary_invmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax -; CHECK-NOBMI-NEXT: andl %edx, %eax -; CHECK-NOBMI-NEXT: notl %edx -; CHECK-NOBMI-NEXT: andl $42, %edx -; CHECK-NOBMI-NEXT: orl %edx, %eax +; CHECK-NOBMI-NEXT: andl %edx, %esi +; CHECK-NOBMI-NEXT: movl %edx, %eax +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: andl $42, %eax +; CHECK-NOBMI-NEXT: orl %esi, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_42_vary_invmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax -; CHECK-BMI-NEXT: andl %edx, %eax -; CHECK-BMI-NEXT: notl %edx -; CHECK-BMI-NEXT: andl $42, %edx -; CHECK-BMI-NEXT: orl %edx, %eax +; CHECK-BMI-NEXT: andl %edx, %esi +; CHECK-BMI-NEXT: movl %edx, %eax +; CHECK-BMI-NEXT: notl %eax +; CHECK-BMI-NEXT: andl $42, %eax +; CHECK-BMI-NEXT: orl %esi, %eax ; CHECK-BMI-NEXT: retq %notmask = xor i32 %mask, -1 %mx = and i32 %notmask, 42 Index: test/CodeGen/X86/vec_ctbits.ll =================================================================== --- test/CodeGen/X86/vec_ctbits.ll +++ test/CodeGen/X86/vec_ctbits.ll @@ -94,11 +94,10 @@ ; CHECK-NEXT: paddq %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlq $4, %xmm1 -; CHECK-NEXT: paddq %xmm0, %xmm1 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: psadbw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: paddq %xmm1, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psadbw %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) ret <2 x i64> %c @@ -189,23 +188,22 @@ ; CHECK-LABEL: prompop: ; CHECK: # %bb.0: ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm2 -; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $1, %xmm1 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: psubq %xmm1, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: psrlq $1, %xmm2 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-NEXT: psubq %xmm2, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [3689348814741910323,3689348814741910323] ; CHECK-NEXT: movdqa %xmm0, %xmm3 -; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: pand %xmm2, %xmm3 ; CHECK-NEXT: psrlq $2, %xmm0 -; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 ; CHECK-NEXT: paddq %xmm3, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $4, %xmm1 -; CHECK-NEXT: paddq %xmm0, %xmm1 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: psadbw %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: psrlq $4, %xmm2 +; CHECK-NEXT: paddq %xmm2, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: psadbw %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) ret <2 x i32> %c Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -412,21 +412,20 @@ define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) { ; SSE-LABEL: uitofp_2i64_to_2f64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] -; SSE-NEXT: subpd 
%xmm4, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: subpd %xmm4, %xmm3 -; SSE-NEXT: movapd %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] +; SSE-NEXT: subpd %xmm3, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: addpd %xmm4, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: subpd %xmm3, %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: addpd %xmm2, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; VEX-LABEL: uitofp_2i64_to_2f64: @@ -694,34 +693,32 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { ; SSE-LABEL: uitofp_4i64_to_4f64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25] -; SSE-NEXT: subpd %xmm5, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: addpd %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: subpd %xmm5, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE-NEXT: addpd %xmm4, %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: subpd %xmm5, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: addpd %xmm1, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: subpd %xmm5, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE-NEXT: addpd %xmm4, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] +; SSE-NEXT: subpd %xmm4, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: addpd %xmm5, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: subpd %xmm4, %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: addpd %xmm3, %xmm5 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: subpd %xmm4, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: 
punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: subpd %xmm4, %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: addpd %xmm3, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: retq ; ; VEX-LABEL: uitofp_4i64_to_4f64: Index: test/CodeGen/X86/vec_minmax_sint.ll =================================================================== --- test/CodeGen/X86/vec_minmax_sint.ll +++ test/CodeGen/X86/vec_minmax_sint.ll @@ -198,8 +198,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v4i32: @@ -228,14 +227,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v8i32: @@ -325,8 +322,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v16i8: @@ -355,14 +351,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v32i8: @@ -591,8 +585,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v4i32: @@ -621,14 +614,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v8i32: @@ -718,8 +709,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v16i8: @@ -748,14 +738,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v32i8: Index: 
test/CodeGen/X86/vec_minmax_uint.ll =================================================================== --- test/CodeGen/X86/vec_minmax_uint.ll +++ test/CodeGen/X86/vec_minmax_uint.ll @@ -251,23 +251,22 @@ define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: max_gt_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v8i32: @@ -672,23 +671,22 @@ define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: max_ge_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v8i32: Index: test/CodeGen/X86/vec_sdiv_to_shift.ll =================================================================== --- test/CodeGen/X86/vec_sdiv_to_shift.ll +++ test/CodeGen/X86/vec_sdiv_to_shift.ll @@ -9,9 +9,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psraw $15, %xmm1 ; SSE-NEXT: psrlw $11, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: psraw $5, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: psraw $5, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: sdiv_vec8x16: @@ -32,9 +31,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psraw $15, %xmm1 ; SSE-NEXT: psrlw $11, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; 
SSE-NEXT: psraw $5, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: psraw $5, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: sdiv_vec8x16_minsize: @@ -55,9 +53,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrad $31, %xmm1 ; SSE-NEXT: psrld $28, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: psrad $4, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: psrad $4, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: sdiv_vec4x32: @@ -104,15 +101,13 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: psrld $26, %xmm2 -; SSE-NEXT: paddd %xmm0, %xmm2 -; SSE-NEXT: psrad $6, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrad $31, %xmm3 -; SSE-NEXT: psrld $26, %xmm3 -; SSE-NEXT: paddd %xmm1, %xmm3 -; SSE-NEXT: psrad $6, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: psrad $6, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: psrld $26, %xmm2 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: psrad $6, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: sdiv8x32: @@ -147,15 +142,13 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psraw $15, %xmm2 ; SSE-NEXT: psrlw $14, %xmm2 -; SSE-NEXT: paddw %xmm0, %xmm2 -; SSE-NEXT: psraw $2, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psraw $15, %xmm3 -; SSE-NEXT: psrlw $14, %xmm3 -; SSE-NEXT: paddw %xmm1, %xmm3 -; SSE-NEXT: psraw $2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: psraw $2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psraw $15, %xmm2 +; SSE-NEXT: psrlw $14, %xmm2 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: psraw $2, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: sdiv16x16: Index: test/CodeGen/X86/vec_shift6.ll =================================================================== --- test/CodeGen/X86/vec_shift6.ll +++ test/CodeGen/X86/vec_shift6.ll @@ -77,8 +77,7 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pslld $1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX-LABEL: test4: Index: test/CodeGen/X86/vector-bitreverse.ll =================================================================== --- test/CodeGen/X86/vector-bitreverse.ll +++ test/CodeGen/X86/vector-bitreverse.ll @@ -14,40 +14,36 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; SSE-LABEL: test_bitreverse_i8: ; SSE: # %bb.0: +; SSE-NEXT: rolb $4, %dil ; SSE-NEXT: movl %edi, %eax -; SSE-NEXT: rolb $4, %al -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andb $51, %cl -; SSE-NEXT: shlb $2, %cl -; SSE-NEXT: andb $-52, %al -; SSE-NEXT: shrb $2, %al -; SSE-NEXT: orb %cl, %al -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andb $85, %cl -; SSE-NEXT: addb %cl, %cl -; SSE-NEXT: andb $-86, %al -; SSE-NEXT: shrb %al -; SSE-NEXT: orb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: andb $51, %al +; SSE-NEXT: shlb $2, %al +; SSE-NEXT: andb $-52, %dil +; SSE-NEXT: shrb $2, %dil +; SSE-NEXT: orb %al, %dil +; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: andb $85, %al +; SSE-NEXT: addb %al, %al +; SSE-NEXT: andb $-86, %dil +; SSE-NEXT: shrb %dil +; SSE-NEXT: orb %dil, %al ; SSE-NEXT: retq ; ; AVX-LABEL: test_bitreverse_i8: ; AVX: # %bb.0: +; AVX-NEXT: rolb $4, %dil ; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: rolb $4, %al -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: andb $51, %cl -; 
AVX-NEXT: shlb $2, %cl -; AVX-NEXT: andb $-52, %al -; AVX-NEXT: shrb $2, %al -; AVX-NEXT: orb %cl, %al -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: andb $85, %cl -; AVX-NEXT: addb %cl, %cl -; AVX-NEXT: andb $-86, %al -; AVX-NEXT: shrb %al -; AVX-NEXT: orb %cl, %al -; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: andb $51, %al +; AVX-NEXT: shlb $2, %al +; AVX-NEXT: andb $-52, %dil +; AVX-NEXT: shrb $2, %dil +; AVX-NEXT: orb %al, %dil +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: andb $85, %al +; AVX-NEXT: addb %al, %al +; AVX-NEXT: andb $-86, %dil +; AVX-NEXT: shrb %dil +; AVX-NEXT: orb %dil, %al ; AVX-NEXT: retq ; ; XOP-LABEL: test_bitreverse_i8: Index: test/CodeGen/X86/vector-idiv-sdiv-128.ll =================================================================== --- test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -510,8 +510,7 @@ ; SSE2-NEXT: psllw $3, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: psubb %xmm2, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_16i8: Index: test/CodeGen/X86/vector-idiv-udiv-128.ll =================================================================== --- test/CodeGen/X86/vector-idiv-udiv-128.ll +++ test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -379,8 +379,7 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pslld $3, %xmm2 ; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_4i32: @@ -489,34 +488,32 @@ ; SSE2-NEXT: psllw $3, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: psubb %xmm2, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37] -; SSE41-NEXT: pmullw %xmm1, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37] +; SSE41-NEXT: pmullw %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: pmullw %xmm1, %xmm3 +; SSE41-NEXT: pmullw %xmm2, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psubb %xmm2, %xmm1 -; SSE41-NEXT: psrlw $1, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: psrlw $2, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psllw $3, %xmm2 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psubb %xmm1, %xmm2 +; SSE41-NEXT: psrlw $1, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: paddb %xmm1, %xmm2 +; SSE41-NEXT: psrlw $2, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: psubb %xmm2, %xmm1 -; SSE41-NEXT: paddb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psllw $3, %xmm1 +; 
SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psubb %xmm1, %xmm2 +; SSE41-NEXT: paddb %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_rem7_16i8: Index: test/CodeGen/X86/vector-lzcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-lzcnt-128.ll +++ test/CodeGen/X86/vector-lzcnt-128.ll @@ -126,8 +126,7 @@ ; SSSE3-NEXT: psrlq $32, %xmm0 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: psrlq $32, %xmm1 -; SSSE3-NEXT: paddq %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv2i64: @@ -162,8 +161,7 @@ ; SSE41-NEXT: psrlq $32, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv2i64: @@ -271,8 +269,7 @@ ; X32-SSE-NEXT: psrlq $32, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrlq $32, %xmm1 -; X32-SSE-NEXT: paddq %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: paddq %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0) @@ -392,8 +389,7 @@ ; SSSE3-NEXT: psrlq $32, %xmm0 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: psrlq $32, %xmm1 -; SSSE3-NEXT: paddq %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv2i64u: @@ -428,8 +424,7 @@ ; SSE41-NEXT: psrlq $32, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv2i64u: @@ -537,8 +532,7 @@ ; X32-SSE-NEXT: psrlq $32, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrlq $32, %xmm1 -; X32-SSE-NEXT: paddq %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: paddq %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 -1) @@ -632,62 +626,60 @@ ; ; SSSE3-LABEL: testv4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: psrlw $4, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: paddb %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm3 ; SSSE3-NEXT: psrlw $8, %xmm3 -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqw %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: psrlw $8, %xmm2 +; SSSE3-NEXT: paddw %xmm3, %xmm2 +; SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 ; SSSE3-NEXT: psrld $16, %xmm0 
-; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrld $16, %xmm1 -; SSSE3-NEXT: paddd %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: psrld $16, %xmm2 +; SSSE3-NEXT: paddd %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pshufb %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqb %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: paddb %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqb %xmm1, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm1 -; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: paddd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv4i32: @@ -755,32 +747,31 @@ ; ; X32-SSE-LABEL: testv4i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm4 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pshufb %xmm1, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X32-SSE-NEXT: pand %xmm4, %xmm1 -; X32-SSE-NEXT: paddb %xmm3, %xmm1 +; X32-SSE-NEXT: pshufb %xmm2, %xmm4 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psrlw $4, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: pshufb %xmm2, %xmm3 +; X32-SSE-NEXT: pcmpeqb %xmm1, %xmm2 +; X32-SSE-NEXT: pand %xmm4, %xmm2 +; X32-SSE-NEXT: paddb %xmm3, %xmm2 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm3 +; X32-SSE-NEXT: pcmpeqb %xmm1, %xmm3 ; X32-SSE-NEXT: psrlw $8, %xmm3 -; X32-SSE-NEXT: pand %xmm1, %xmm3 -; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm3, %xmm1 -; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; 
X32-SSE-NEXT: paddw %xmm3, %xmm2 +; X32-SSE-NEXT: pcmpeqw %xmm1, %xmm0 ; X32-SSE-NEXT: psrld $16, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: psrld $16, %xmm1 -; X32-SSE-NEXT: paddd %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: psrld $16, %xmm2 +; X32-SSE-NEXT: paddd %xmm2, %xmm0 ; X32-SSE-NEXT: retl %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 0) @@ -874,62 +865,60 @@ ; ; SSSE3-LABEL: testv4i32u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: psrlw $4, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: paddb %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm3 ; SSSE3-NEXT: psrlw $8, %xmm3 -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqw %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: psrlw $8, %xmm2 +; SSSE3-NEXT: paddw %xmm3, %xmm2 +; SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 ; SSSE3-NEXT: psrld $16, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrld $16, %xmm1 -; SSSE3-NEXT: paddd %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: psrld $16, %xmm2 +; SSSE3-NEXT: paddd %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv4i32u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pshufb %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqb %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: paddb %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqb %xmm1, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm1 -; SSE41-NEXT: pcmpeqw %xmm2, 
%xmm0 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: paddd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv4i32u: @@ -997,32 +986,31 @@ ; ; X32-SSE-LABEL: testv4i32u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm4 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pshufb %xmm1, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X32-SSE-NEXT: pand %xmm4, %xmm1 -; X32-SSE-NEXT: paddb %xmm3, %xmm1 +; X32-SSE-NEXT: pshufb %xmm2, %xmm4 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psrlw $4, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: pshufb %xmm2, %xmm3 +; X32-SSE-NEXT: pcmpeqb %xmm1, %xmm2 +; X32-SSE-NEXT: pand %xmm4, %xmm2 +; X32-SSE-NEXT: paddb %xmm3, %xmm2 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm3 +; X32-SSE-NEXT: pcmpeqb %xmm1, %xmm3 ; X32-SSE-NEXT: psrlw $8, %xmm3 -; X32-SSE-NEXT: pand %xmm1, %xmm3 -; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm3, %xmm1 -; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: paddw %xmm3, %xmm2 +; X32-SSE-NEXT: pcmpeqw %xmm1, %xmm0 ; X32-SSE-NEXT: psrld $16, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: psrld $16, %xmm1 -; X32-SSE-NEXT: paddd %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: psrld $16, %xmm2 +; X32-SSE-NEXT: paddd %xmm2, %xmm0 ; X32-SSE-NEXT: retl %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 -1) @@ -1104,50 +1092,48 @@ ; ; SSSE3-LABEL: testv8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: psrlw $4, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: paddb %xmm3, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm1, 
%xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: psrlw $8, %xmm2 +; SSSE3-NEXT: paddw %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pshufb %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqb %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: paddb %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: paddw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv8i16: @@ -1210,26 +1196,25 @@ ; ; X32-SSE-LABEL: testv8i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm4 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pshufb %xmm1, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X32-SSE-NEXT: pand %xmm4, %xmm1 -; X32-SSE-NEXT: paddb %xmm3, %xmm1 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; X32-SSE-NEXT: pshufb %xmm2, %xmm4 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psrlw $4, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: pshufb %xmm2, %xmm3 +; X32-SSE-NEXT: pcmpeqb %xmm1, %xmm2 +; X32-SSE-NEXT: pand %xmm4, %xmm2 +; X32-SSE-NEXT: paddb %xmm3, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm1, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: paddw %xmm2, %xmm0 ; X32-SSE-NEXT: retl %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0) ret <8 x i16> %out @@ -1310,50 +1295,48 @@ ; ; SSSE3-LABEL: testv8i16u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: 
pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: psrlw $4, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: paddb %xmm3, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: psrlw $8, %xmm2 +; SSSE3-NEXT: paddw %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv8i16u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pshufb %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqb %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: paddb %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: paddw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv8i16u: @@ -1416,26 +1399,25 @@ ; ; X32-SSE-LABEL: testv8i16u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm4 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pshufb %xmm1, %xmm3 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X32-SSE-NEXT: pand %xmm4, %xmm1 -; X32-SSE-NEXT: paddb %xmm3, %xmm1 -; X32-SSE-NEXT: pcmpeqb 
%xmm2, %xmm0 +; X32-SSE-NEXT: pshufb %xmm2, %xmm4 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psrlw $4, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: pshufb %xmm2, %xmm3 +; X32-SSE-NEXT: pcmpeqb %xmm1, %xmm2 +; X32-SSE-NEXT: pand %xmm4, %xmm2 +; X32-SSE-NEXT: paddb %xmm3, %xmm2 +; X32-SSE-NEXT: pcmpeqb %xmm1, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: paddw %xmm2, %xmm0 ; X32-SSE-NEXT: retl %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1) ret <8 x i16> %out Index: test/CodeGen/X86/vector-mul.ll =================================================================== --- test/CodeGen/X86/vector-mul.ll +++ test/CodeGen/X86/vector-mul.ll @@ -234,16 +234,14 @@ ; X86: # %bb.0: ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psllq $4, %xmm1 -; X86-NEXT: paddq %xmm0, %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm0 +; X86-NEXT: paddq %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: mul_v2i64_17: ; X64: # %bb.0: ; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: psllq $4, %xmm1 -; X64-NEXT: paddq %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: paddq %xmm1, %xmm0 ; X64-NEXT: retq ; ; X64-AVX-LABEL: mul_v2i64_17: @@ -305,8 +303,7 @@ ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psllw $4, %xmm1 ; X86-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X86-NEXT: paddb %xmm0, %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm0 +; X86-NEXT: paddb %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: mul_v16i8_17: @@ -314,8 +311,7 @@ ; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: psllw $4, %xmm1 ; X64-NEXT: pand {{.*}}(%rip), %xmm1 -; X64-NEXT: paddb %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: paddb %xmm1, %xmm0 ; X64-NEXT: retq ; ; X64-XOP-LABEL: mul_v16i8_17: Index: test/CodeGen/X86/vector-popcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-popcnt-128.ll +++ test/CodeGen/X86/vector-popcnt-128.ll @@ -25,11 +25,10 @@ ; SSE2-NEXT: paddq %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq $4, %xmm1 -; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64: @@ -46,11 +45,10 @@ ; SSE3-NEXT: paddq %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlq $4, %xmm1 -; SSE3-NEXT: paddq %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv2i64: @@ -499,9 +497,8 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8: @@ -518,9 +515,8 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand 
{{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8: Index: test/CodeGen/X86/vector-reduce-fadd-fast.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -182,8 +182,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_zero: @@ -352,8 +351,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_undef: @@ -691,8 +689,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f64_zero: @@ -719,8 +716,7 @@ ; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f64_zero: @@ -758,8 +754,7 @@ ; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f64_zero: @@ -860,8 +855,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f64_undef: @@ -888,8 +882,7 @@ ; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f64_undef: @@ -927,8 +920,7 @@ ; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f64_undef: Index: test/CodeGen/X86/vector-reduce-fmul-fast.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -192,8 +192,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT: mulps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: mulps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_zero: @@ -234,8 +233,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32_zero: @@ -276,8 +274,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; 
SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32_zero: @@ -330,8 +327,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32_zero: @@ -373,8 +369,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT: mulps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: mulps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_undef: @@ -415,8 +410,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32_undef: @@ -457,8 +451,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32_undef: @@ -511,8 +504,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32_undef: @@ -694,8 +686,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_zero: @@ -719,8 +710,7 @@ ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_zero: @@ -754,8 +744,7 @@ ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_zero: @@ -837,8 +826,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_undef: @@ -862,8 +850,7 @@ ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_undef: @@ -897,8 +884,7 @@ ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_undef: Index: test/CodeGen/X86/vector-reduce-fmul.ll 
=================================================================== --- test/CodeGen/X86/vector-reduce-fmul.ll +++ test/CodeGen/X86/vector-reduce-fmul.ll @@ -339,8 +339,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT: mulss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_one: @@ -1184,8 +1183,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_one: @@ -1208,11 +1206,10 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm2 -; SSE-NEXT: mulsd %xmm1, %xmm2 +; SSE-NEXT: mulsd %xmm2, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: mulsd %xmm1, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_one: @@ -1245,17 +1242,16 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm4 -; SSE-NEXT: mulsd %xmm1, %xmm4 +; SSE-NEXT: mulsd %xmm4, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: mulsd %xmm1, %xmm4 -; SSE-NEXT: mulsd %xmm2, %xmm4 +; SSE-NEXT: mulsd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: mulsd %xmm2, %xmm4 -; SSE-NEXT: mulsd %xmm3, %xmm4 +; SSE-NEXT: mulsd %xmm2, %xmm0 +; SSE-NEXT: mulsd %xmm3, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; SSE-NEXT: mulsd %xmm3, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm0 +; SSE-NEXT: mulsd %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_one: @@ -1302,7 +1298,7 @@ ; SSE-LABEL: test_v16f64_one: ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: mulsd %xmm8, %xmm0 ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] Index: test/CodeGen/X86/vector-shift-ashr-128.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-128.ll +++ test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1099,11 +1099,11 @@ ; SSE2-NEXT: psraw $2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm0, %xmm1 ; SSE2-NEXT: psraw $1, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 ; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1185,11 +1185,11 @@ ; X32-SSE-NEXT: psraw $2, %xmm1 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] ; X32-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X32-SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; X32-SSE-NEXT: movaps %xmm2, %xmm0 -; X32-SSE-NEXT: andps %xmm1, %xmm0 +; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; X32-SSE-NEXT: movaps %xmm2, %xmm1 +; X32-SSE-NEXT: andps %xmm0, %xmm1 ; X32-SSE-NEXT: psraw 
$1, %xmm2 -; X32-SSE-NEXT: andnps %xmm2, %xmm1 +; X32-SSE-NEXT: andnps %xmm2, %xmm0 ; X32-SSE-NEXT: orps %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <8 x i16> %a, Index: test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining.ll +++ test/CodeGen/X86/vector-shuffle-combining.ll @@ -2730,8 +2730,7 @@ ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE2-NEXT: addps %xmm0, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: addps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: PR22390: @@ -2739,8 +2738,7 @@ ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] ; SSSE3-NEXT: movaps %xmm0, %xmm2 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSSE3-NEXT: addps %xmm0, %xmm2 -; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: addps %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: PR22390: Index: test/CodeGen/X86/vselect-minmax.ll =================================================================== --- test/CodeGen/X86/vselect-minmax.ll +++ test/CodeGen/X86/vselect-minmax.ll @@ -63,8 +63,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test3: @@ -89,8 +88,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test4: @@ -393,8 +391,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test19: @@ -419,8 +416,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test20: @@ -647,14 +643,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test27: @@ -694,14 +688,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test28: @@ -1255,14 +1247,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; 
SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test43: @@ -1302,14 +1292,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test44: @@ -1447,23 +1435,22 @@ define <8 x i32> @test47(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test47: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test47: @@ -1499,23 +1486,22 @@ define <8 x i32> @test48(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test48: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test48: @@ -1555,8 +1541,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test49: @@ 
-1581,8 +1566,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test50: @@ -1885,8 +1869,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test65: @@ -1911,8 +1894,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test66: @@ -2099,14 +2081,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test73: @@ -2146,14 +2126,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test74: @@ -2707,14 +2685,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test89: @@ -2754,14 +2730,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test90: @@ -2887,23 +2861,22 @@ define <8 x i32> @test93(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test93: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, 
%xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test93: @@ -2939,23 +2912,22 @@ define <8 x i32> @test94(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test94: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test94: @@ -3225,26 +3197,22 @@ ; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test99: @@ -3292,26 +3260,22 @@ ; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; 
SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test100: @@ -4037,26 +4001,22 @@ ; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test115: @@ -4104,26 +4064,22 @@ ; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test116: @@ -4317,40 +4273,38 @@ define <16 x i32> @test119(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: test119: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = 
[2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm8 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm9 -; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test119: @@ -4394,40 +4348,38 @@ define <16 x i32> @test120(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: test120: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm8 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm9 -; SSE2-NEXT: por %xmm9, %xmm3 -; 
SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test120: @@ -5555,26 +5507,22 @@ ; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test129: @@ -5622,26 +5570,22 @@ ; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test130: @@ -6367,26 +6311,22 @@ ; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: 
pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test145: @@ -6434,26 +6374,22 @@ ; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test146: @@ -6623,40 +6559,38 @@ define <16 x i32> @test149(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: test149: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm8 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm3, 
%xmm9 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm9 -; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test149: @@ -6700,40 +6634,38 @@ define <16 x i32> @test150(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: test150: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm8 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm9 -; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test150: Index: test/CodeGen/X86/x86-shifts.ll =================================================================== --- test/CodeGen/X86/x86-shifts.ll +++ test/CodeGen/X86/x86-shifts.ll @@ -219,26 +219,24 @@ 
define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind { ; X32-LABEL: shr2_nosplat: ; X32: # %bb.0: # %entry -; X32-NEXT: movdqa %xmm0, %xmm2 -; X32-NEXT: psrlq $8, %xmm2 ; X32-NEXT: movdqa %xmm0, %xmm1 -; X32-NEXT: psrlq $1, %xmm1 -; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X32-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; X32-NEXT: xorpd %xmm0, %xmm1 -; X32-NEXT: movapd %xmm1, %xmm0 +; X32-NEXT: psrlq $8, %xmm1 +; X32-NEXT: movdqa %xmm0, %xmm2 +; X32-NEXT: psrlq $1, %xmm2 +; X32-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X32-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; X32-NEXT: xorpd %xmm2, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: shr2_nosplat: ; X64: # %bb.0: # %entry -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: psrlq $8, %xmm2 ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psrlq $1, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X64-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; X64-NEXT: xorpd %xmm0, %xmm1 -; X64-NEXT: movapd %xmm1, %xmm0 +; X64-NEXT: psrlq $8, %xmm1 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: psrlq $1, %xmm2 +; X64-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X64-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; X64-NEXT: xorpd %xmm2, %xmm0 ; X64-NEXT: retq entry: %B = lshr <2 x i64> %A, < i64 8, i64 1>