Index: lib/CodeGen/TwoAddressInstructionPass.cpp
===================================================================
--- lib/CodeGen/TwoAddressInstructionPass.cpp
+++ lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1593,6 +1593,10 @@
     if (MO.isKill()) {
       MO.setIsKill(false);
       RemovedKillFlag = true;
+      // Propagate SrcRegMap. We do this only when the operand is killed since
+      // otherwise this copy is likely unavoidable and we don't want to
+      // propagate a physical register through it.
+      SrcRegMap[RegA] = RegB;
     }
 
     // Make sure regA is a legal regclass for the SrcIdx operand.
@@ -1604,9 +1608,6 @@
     // by SubRegB is compatible with RegA with no subregister. So regardless of
     // whether the dest oper writes a subreg, the source oper should not.
     MO.setSubReg(0);
-
-    // Propagate SrcRegMap.
-    SrcRegMap[RegA] = RegB;
   }
 
   if (AllUsesCopied) {
Index: test/CodeGen/X86/avx512-mask-op.ll
===================================================================
--- test/CodeGen/X86/avx512-mask-op.ll
+++ test/CodeGen/X86/avx512-mask-op.ll
@@ -151,10 +151,9 @@
 ; CHECK-LABEL: mand16:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    andl %esi, %ecx
-; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    orl %edi, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 ;
Index: test/CodeGen/X86/avx512-regcall-NoMask.ll
===================================================================
--- test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -970,45 +970,46 @@
 ; X32-NEXT:    pushl %ebx
 ; X32-NEXT:    subl $20, %esp
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, %esi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    subl %ecx, %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl %esi, %ebp
-; X32-NEXT:    subl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT:    imull %ebp, %ebx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    subl %edi, %ebp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    imull %ebp, %ecx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X32-NEXT:    movl %edi, %ebp
 ; X32-NEXT:    subl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    imull %ebx, %ebp
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    subl %esi, %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    subl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    imull %ebp, %eax
-; X32-NEXT:    addl %eax, %ebx
+; X32-NEXT:    imull %ebx, %eax
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %esi
+; X32-NEXT:    subl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl %ebx, %edx
+; X32-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    imull %esi, %edx
+; X32-NEXT:    addl %eax, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl (%esp), %esi # 
4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp ; X32-NEXT: addl {{[0-9]+}}(%esp), %edi -; X32-NEXT: addl {{[0-9]+}}(%esp), %esi -; X32-NEXT: imull %eax, %esi -; X32-NEXT: addl {{[0-9]+}}(%esp), %edx -; X32-NEXT: imull %ebp, %edx -; X32-NEXT: addl %esi, %edx +; X32-NEXT: imull %eax, %edi ; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: imull %edi, %ecx -; X32-NEXT: addl %edx, %ecx +; X32-NEXT: imull %esi, %ecx +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: imull %ebp, %ebx ; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: addl %edx, %ebx ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: addl $20, %esp ; X32-NEXT: popl %ebx @@ -1020,35 +1021,40 @@ ; WIN64-NEXT: pushq %r13 ; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx -; WIN64-NEXT: movl %eax, %r13d -; WIN64-NEXT: subl %ecx, %eax -; WIN64-NEXT: movl %edx, %ebp -; WIN64-NEXT: subl %edi, %ebp -; WIN64-NEXT: movl %r9d, %ebx -; WIN64-NEXT: subl %r10d, %ebx -; WIN64-NEXT: imull %ebx, %eax -; WIN64-NEXT: movl %r11d, %ebx -; WIN64-NEXT: subl %r12d, %ebx -; WIN64-NEXT: imull %ebp, %ebx -; WIN64-NEXT: movl %esi, %ebp -; WIN64-NEXT: subl %r8d, %ebp -; WIN64-NEXT: addl %ebx, %eax +; WIN64-NEXT: subq $4, %rsp +; WIN64-NEXT: movl %ecx, %ebp +; WIN64-NEXT: movl %ecx, (%rsp) # 4-byte Spill +; WIN64-NEXT: movl %r11d, %ecx +; WIN64-NEXT: movl %eax, %ebx +; WIN64-NEXT: subl %ebp, %ebx +; WIN64-NEXT: movl %edx, %r13d +; WIN64-NEXT: subl %edi, %r13d +; WIN64-NEXT: movl %r9d, %ebp +; WIN64-NEXT: subl %r10d, %ebp +; WIN64-NEXT: imull %ebx, %ebp +; WIN64-NEXT: subl %r12d, %r11d +; WIN64-NEXT: imull %r13d, %r11d +; WIN64-NEXT: movl %esi, %r13d +; WIN64-NEXT: subl %r8d, %r13d +; WIN64-NEXT: addl %ebp, %r11d ; WIN64-NEXT: movl %r14d, %ebx ; WIN64-NEXT: subl %r15d, %ebx -; WIN64-NEXT: imull %ebp, %ebx -; WIN64-NEXT: addl %ebx, %eax -; WIN64-NEXT: addl %ecx, %r13d +; WIN64-NEXT: imull %r13d, %ebx +; WIN64-NEXT: addl %r11d, %ebx +; WIN64-NEXT: addl (%rsp), %eax # 4-byte Folded Reload ; WIN64-NEXT: addl %edi, %edx ; WIN64-NEXT: addl %r8d, %esi ; WIN64-NEXT: addl %r10d, %r9d -; WIN64-NEXT: imull %r13d, %r9d -; WIN64-NEXT: addl %r12d, %r11d -; WIN64-NEXT: imull %edx, %r11d -; WIN64-NEXT: addl %r9d, %r11d +; WIN64-NEXT: imull %eax, %r9d +; WIN64-NEXT: addl %r12d, %ecx +; WIN64-NEXT: imull %edx, %ecx +; WIN64-NEXT: addl %r9d, %ecx ; WIN64-NEXT: addl %r15d, %r14d ; WIN64-NEXT: imull %esi, %r14d -; WIN64-NEXT: addl %r11d, %r14d -; WIN64-NEXT: addl %r14d, %eax +; WIN64-NEXT: addl %ecx, %r14d +; WIN64-NEXT: addl %r14d, %ebx +; WIN64-NEXT: movl %ebx, %eax +; WIN64-NEXT: addq $4, %rsp ; WIN64-NEXT: popq %rbx ; WIN64-NEXT: popq %rbp ; WIN64-NEXT: popq %r13 @@ -1058,36 +1064,38 @@ ; LINUXOSX64: # %bb.0: ; LINUXOSX64-NEXT: pushq %rbp ; LINUXOSX64-NEXT: pushq %rbx +; LINUXOSX64-NEXT: movl %ecx, %r11d +; LINUXOSX64-NEXT: movl %r13d, %ecx ; LINUXOSX64-NEXT: movl %eax, %r10d -; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; LINUXOSX64-NEXT: subl %ecx, %eax +; LINUXOSX64-NEXT: subl %r11d, %r10d ; LINUXOSX64-NEXT: movl %edx, %ebx ; LINUXOSX64-NEXT: subl %edi, %ebx ; LINUXOSX64-NEXT: movl %r9d, %ebp ; LINUXOSX64-NEXT: subl %r12d, %ebp -; LINUXOSX64-NEXT: imull %ebp, %eax -; LINUXOSX64-NEXT: movl %r13d, %ebp -; LINUXOSX64-NEXT: subl %r14d, %ebp -; LINUXOSX64-NEXT: imull %ebx, %ebp +; LINUXOSX64-NEXT: imull %r10d, %ebp +; LINUXOSX64-NEXT: subl %r14d, %r13d +; LINUXOSX64-NEXT: imull %ebx, %r13d ; LINUXOSX64-NEXT: movl %esi, %ebx ; LINUXOSX64-NEXT: subl %r8d, %ebx -; LINUXOSX64-NEXT: addl 
%ebp, %eax -; LINUXOSX64-NEXT: movl %r15d, %ebp -; LINUXOSX64-NEXT: subl %r11d, %ebp -; LINUXOSX64-NEXT: imull %ebx, %ebp -; LINUXOSX64-NEXT: addl %ebp, %eax -; LINUXOSX64-NEXT: addl %ecx, %r10d +; LINUXOSX64-NEXT: addl %ebp, %r13d +; LINUXOSX64-NEXT: movl %r15d, %r10d +; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %ebp +; LINUXOSX64-NEXT: subl %ebp, %r10d +; LINUXOSX64-NEXT: imull %ebx, %r10d +; LINUXOSX64-NEXT: addl %r13d, %r10d +; LINUXOSX64-NEXT: addl %r11d, %eax ; LINUXOSX64-NEXT: addl %edi, %edx ; LINUXOSX64-NEXT: addl %r8d, %esi ; LINUXOSX64-NEXT: addl %r12d, %r9d -; LINUXOSX64-NEXT: imull %r10d, %r9d -; LINUXOSX64-NEXT: addl %r14d, %r13d -; LINUXOSX64-NEXT: imull %edx, %r13d -; LINUXOSX64-NEXT: addl %r9d, %r13d -; LINUXOSX64-NEXT: addl %r11d, %r15d +; LINUXOSX64-NEXT: imull %eax, %r9d +; LINUXOSX64-NEXT: addl %r14d, %ecx +; LINUXOSX64-NEXT: imull %edx, %ecx +; LINUXOSX64-NEXT: addl %r9d, %ecx +; LINUXOSX64-NEXT: addl %ebp, %r15d ; LINUXOSX64-NEXT: imull %esi, %r15d -; LINUXOSX64-NEXT: addl %r13d, %r15d -; LINUXOSX64-NEXT: addl %r15d, %eax +; LINUXOSX64-NEXT: addl %ecx, %r15d +; LINUXOSX64-NEXT: addl %r15d, %r10d +; LINUXOSX64-NEXT: movl %r10d, %eax ; LINUXOSX64-NEXT: popq %rbx ; LINUXOSX64-NEXT: popq %rbp ; LINUXOSX64-NEXT: retq Index: test/CodeGen/X86/avx512-schedule.ll =================================================================== --- test/CodeGen/X86/avx512-schedule.ll +++ test/CodeGen/X86/avx512-schedule.ll @@ -6792,20 +6792,18 @@ ; GENERIC-LABEL: mand16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33] -; GENERIC-NEXT: movl %edi, %ecx # sched: [1:0.33] -; GENERIC-NEXT: andl %esi, %ecx # sched: [1:0.33] -; GENERIC-NEXT: xorl %esi, %eax # sched: [1:0.33] -; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33] +; GENERIC-NEXT: andl %esi, %eax # sched: [1:0.33] +; GENERIC-NEXT: xorl %esi, %edi # sched: [1:0.33] +; GENERIC-NEXT: orl %edi, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def $ax killed $ax killed $eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mand16: ; SKX: # %bb.0: ; SKX-NEXT: movl %edi, %eax # sched: [1:0.25] -; SKX-NEXT: movl %edi, %ecx # sched: [1:0.25] -; SKX-NEXT: andl %esi, %ecx # sched: [1:0.25] -; SKX-NEXT: xorl %esi, %eax # sched: [1:0.25] -; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25] +; SKX-NEXT: andl %esi, %eax # sched: [1:0.25] +; SKX-NEXT: xorl %esi, %edi # sched: [1:0.25] +; SKX-NEXT: orl %edi, %eax # sched: [1:0.25] ; SKX-NEXT: # kill: def $ax killed $ax killed $eax ; SKX-NEXT: retq # sched: [7:1.00] %ma = bitcast i16 %x to <16 x i1> Index: test/CodeGen/X86/avx512bw-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512bw-mask-op.ll +++ test/CodeGen/X86/avx512bw-mask-op.ll @@ -79,10 +79,9 @@ ; CHECK-LABEL: mand32: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: andl %esi, %ecx -; CHECK-NEXT: xorl %esi, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: orl %edi, %eax ; CHECK-NEXT: retq %ma = bitcast i32 %x to <32 x i1> %mb = bitcast i32 %y to <32 x i1> @@ -116,10 +115,9 @@ ; CHECK-LABEL: mand64: ; CHECK: ## %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq %rdi, %rcx -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: xorq %rsi, %rax -; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: xorq %rsi, %rdi +; CHECK-NEXT: orq %rdi, %rax ; CHECK-NEXT: retq %ma = bitcast i64 %x to <64 x i1> %mb = bitcast i64 %y to <64 x i1> Index: 
test/CodeGen/X86/avx512dq-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512dq-mask-op.ll +++ test/CodeGen/X86/avx512dq-mask-op.ll @@ -33,11 +33,9 @@ ; CHECK-LABEL: mand8: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: andb %sil, %cl -; CHECK-NEXT: xorb %sil, %al -; CHECK-NEXT: orb %cl, %al -; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: andb %sil, %al +; CHECK-NEXT: xorb %sil, %dil +; CHECK-NEXT: orb %dil, %al ; CHECK-NEXT: retq %ma = bitcast i8 %x to <8 x i1> %mb = bitcast i8 %y to <8 x i1> Index: test/CodeGen/X86/bitreverse.ll =================================================================== --- test/CodeGen/X86/bitreverse.ll +++ test/CodeGen/X86/bitreverse.ll @@ -341,21 +341,19 @@ ; ; X64-LABEL: test_bitreverse_i8: ; X64: # %bb.0: +; X64-NEXT: rolb $4, %dil ; X64-NEXT: movl %edi, %eax -; X64-NEXT: rolb $4, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $51, %cl -; X64-NEXT: shlb $2, %cl -; X64-NEXT: andb $-52, %al -; X64-NEXT: shrb $2, %al -; X64-NEXT: orb %cl, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $85, %cl -; X64-NEXT: addb %cl, %cl -; X64-NEXT: andb $-86, %al -; X64-NEXT: shrb %al -; X64-NEXT: orb %cl, %al -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: andb $51, %al +; X64-NEXT: shlb $2, %al +; X64-NEXT: andb $-52, %dil +; X64-NEXT: shrb $2, %dil +; X64-NEXT: orb %al, %dil +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andb $85, %al +; X64-NEXT: addb %al, %al +; X64-NEXT: andb $-86, %dil +; X64-NEXT: shrb %dil +; X64-NEXT: orb %dil, %al ; X64-NEXT: retq %b = call i8 @llvm.bitreverse.i8(i8 %a) ret i8 %b @@ -385,22 +383,20 @@ ; ; X64-LABEL: test_bitreverse_i4: ; X64: # %bb.0: +; X64-NEXT: rolb $4, %dil ; X64-NEXT: movl %edi, %eax -; X64-NEXT: rolb $4, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $51, %cl -; X64-NEXT: shlb $2, %cl -; X64-NEXT: andb $-52, %al -; X64-NEXT: shrb $2, %al -; X64-NEXT: orb %cl, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $80, %cl -; X64-NEXT: addb %cl, %cl -; X64-NEXT: andb $-96, %al -; X64-NEXT: shrb %al -; X64-NEXT: orb %cl, %al +; X64-NEXT: andb $51, %al +; X64-NEXT: shlb $2, %al +; X64-NEXT: andb $-52, %dil +; X64-NEXT: shrb $2, %dil +; X64-NEXT: orb %al, %dil +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andb $80, %al +; X64-NEXT: addb %al, %al +; X64-NEXT: andb $-96, %dil +; X64-NEXT: shrb %dil +; X64-NEXT: orb %dil, %al ; X64-NEXT: shrb $4, %al -; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %b = call i4 @llvm.bitreverse.i4(i4 %a) ret i4 %b Index: test/CodeGen/X86/bswap_tree2.ll =================================================================== --- test/CodeGen/X86/bswap_tree2.ll +++ test/CodeGen/X86/bswap_tree2.ll @@ -24,17 +24,16 @@ ; ; CHECK64-LABEL: test1: ; CHECK64: # %bb.0: -; CHECK64-NEXT: movl %edi, %eax ; CHECK64-NEXT: movl %edi, %ecx ; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 -; CHECK64-NEXT: movl %edi, %edx -; CHECK64-NEXT: orl $-16777216, %edx # imm = 0xFF000000 +; CHECK64-NEXT: movl %edi, %eax +; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 ; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: shrl $8, %edx -; CHECK64-NEXT: orl %ecx, %edx -; CHECK64-NEXT: bswapl %eax -; CHECK64-NEXT: shrl $16, %eax -; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: bswapl %edi +; CHECK64-NEXT: shrl $16, %edi +; CHECK64-NEXT: orl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff 
%byte1 = and i32 %x, 65280 ; 0x0000ff00 @@ -81,7 +80,7 @@ ; CHECK64-NEXT: andl $-16777216, %edi # imm = 0xFF000000 ; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 ; CHECK64-NEXT: orl %edi, %eax -; CHECK64-NEXT: leal (%rax,%rcx), %eax +; CHECK64-NEXT: orl %ecx, %eax ; CHECK64-NEXT: retq %byte1 = lshr i32 %x, 8 %byte0 = shl i32 %x, 8 Index: test/CodeGen/X86/bypass-slow-division-32.ll =================================================================== --- test/CodeGen/X86/bypass-slow-division-32.ll +++ test/CodeGen/X86/bypass-slow-division-32.ll @@ -143,7 +143,7 @@ ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: shrl $31, %eax ; CHECK-NEXT: sarl $3, %edx -; CHECK-NEXT: leal (%edx,%eax), %eax +; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retl %resultdiv = sdiv i32 %a, 33 ret i32 %resultdiv Index: test/CodeGen/X86/combine-fcopysign.ll =================================================================== --- test/CodeGen/X86/combine-fcopysign.ll +++ test/CodeGen/X86/combine-fcopysign.ll @@ -193,38 +193,36 @@ define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float> %y) { ; SSE-LABEL: combine_vec_fcopysign_fpext_sgn: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: cvtss2sd %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtss2sd %xmm2, %xmm0 ; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm7 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: andps %xmm7, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1,2,3] +; SSE-NEXT: movaps {{.*#+}} xmm6 = [NaN,NaN] +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: andps %xmm6, %xmm7 ; SSE-NEXT: movaps {{.*#+}} xmm8 = [-0.0E+0,-0.0E+0] -; SSE-NEXT: andps %xmm8, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: andps %xmm7, %xmm0 -; SSE-NEXT: xorps %xmm4, %xmm4 -; SSE-NEXT: cvtss2sd %xmm5, %xmm4 -; SSE-NEXT: andps %xmm8, %xmm4 -; SSE-NEXT: orps %xmm0, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: andps %xmm7, %xmm0 -; SSE-NEXT: cvtss2sd %xmm3, %xmm3 -; SSE-NEXT: andps %xmm8, %xmm3 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: andps %xmm7, %xmm1 -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtss2sd %xmm6, %xmm0 ; SSE-NEXT: andps %xmm8, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: orps %xmm7, %xmm0 +; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] +; SSE-NEXT: andps %xmm6, %xmm3 +; SSE-NEXT: cvtss2sd %xmm5, %xmm5 +; SSE-NEXT: andps %xmm8, %xmm5 +; SSE-NEXT: orps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: andps %xmm6, %xmm3 +; SSE-NEXT: cvtss2sd %xmm4, %xmm4 +; SSE-NEXT: andps %xmm8, %xmm4 +; SSE-NEXT: orps %xmm3, %xmm4 +; SSE-NEXT: andps %xmm6, %xmm1 +; SSE-NEXT: cvtss2sd %xmm2, %xmm2 +; SSE-NEXT: andps %xmm8, %xmm2 +; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_fcopysign_fpext_sgn: @@ -245,36 +243,37 @@ define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x double> %y) { ; 
SSE-LABEL: combine_vec_fcopysign_fptrunc_sgn: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm5 -; SSE-NEXT: andps %xmm5, %xmm0 -; SSE-NEXT: cvtsd2ss %xmm1, %xmm6 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [NaN,NaN,NaN,NaN] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: andps %xmm5, %xmm6 +; SSE-NEXT: cvtsd2ss %xmm1, %xmm3 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; SSE-NEXT: andps %xmm4, %xmm6 -; SSE-NEXT: orps %xmm6, %xmm0 -; SSE-NEXT: movshdup {{.*#+}} xmm6 = xmm3[1,1,3,3] +; SSE-NEXT: andps %xmm4, %xmm3 +; SSE-NEXT: orps %xmm6, %xmm3 +; SSE-NEXT: movshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] ; SSE-NEXT: andps %xmm5, %xmm6 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: cvtsd2ss %xmm1, %xmm1 ; SSE-NEXT: andps %xmm4, %xmm1 ; SSE-NEXT: orps %xmm6, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: andps %xmm5, %xmm1 ; SSE-NEXT: xorps %xmm6, %xmm6 ; SSE-NEXT: cvtsd2ss %xmm2, %xmm6 ; SSE-NEXT: andps %xmm4, %xmm6 ; SSE-NEXT: orps %xmm1, %xmm6 -; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] -; SSE-NEXT: andps %xmm5, %xmm3 +; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm6[0],xmm3[3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: andps %xmm5, %xmm0 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsd2ss %xmm2, %xmm1 ; SSE-NEXT: andps %xmm4, %xmm1 -; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE-NEXT: orps %xmm0, %xmm1 +; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[0] +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_fcopysign_fptrunc_sgn: Index: test/CodeGen/X86/combine-mul.ll =================================================================== --- test/CodeGen/X86/combine-mul.ll +++ test/CodeGen/X86/combine-mul.ll @@ -82,12 +82,11 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psllq $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psllq $4, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psllq $4, %xmm2 ; SSE-NEXT: psllq $2, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_mul_pow2c: Index: test/CodeGen/X86/combine-sdiv.ll =================================================================== --- test/CodeGen/X86/combine-sdiv.ll +++ test/CodeGen/X86/combine-sdiv.ll @@ -201,15 +201,15 @@ ; SSE41-LABEL: combine_vec_sdiv_by_pos1: ; SSE41: # %bb.0: ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $4, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrld $2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrld $3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psrld $4, %xmm0 -; SSE41-NEXT: psrld $2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: 
pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pos1: @@ -246,9 +246,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrad $31, %xmm1 ; SSE-NEXT: psrld $30, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: psrad $2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: psrad $2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_sdiv_by_pow2a: @@ -489,8 +488,7 @@ ; SSE41-NEXT: psraw $1, %xmm2 ; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16: @@ -611,25 +609,23 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psraw $15, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,4,2,16,8,32,64,2] -; SSE41-NEXT: pmulhuw %xmm4, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1,4,2,16,8,32,64,2] +; SSE41-NEXT: pmulhuw %xmm3, %xmm2 ; SSE41-NEXT: paddw %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pmulhw %xmm5, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: pmulhw %xmm4, %xmm5 ; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6],xmm2[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psraw $15, %xmm3 -; SSE41-NEXT: pmulhuw %xmm4, %xmm3 -; SSE41-NEXT: paddw %xmm1, %xmm3 -; SSE41-NEXT: pmulhw %xmm3, %xmm5 -; SSE41-NEXT: psraw $1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $15, %xmm2 +; SSE41-NEXT: pmulhuw %xmm3, %xmm2 +; SSE41-NEXT: paddw %xmm1, %xmm2 +; SSE41-NEXT: pmulhw %xmm2, %xmm4 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5,6],xmm2[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16: @@ -827,46 +823,43 @@ ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psraw $15, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [1,4,2,16,8,32,64,2] -; SSE41-NEXT: pmulhuw %xmm7, %xmm0 -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pmulhw %xmm6, %xmm5 -; SSE41-NEXT: psraw $1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3,4,5,6],xmm0[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: psraw $15, %xmm1 -; SSE41-NEXT: pmulhuw %xmm7, %xmm1 -; SSE41-NEXT: paddw %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pmulhw %xmm6, %xmm5 
-; SSE41-NEXT: psraw $1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psraw $15, %xmm4 -; SSE41-NEXT: pmulhuw %xmm7, %xmm4 -; SSE41-NEXT: paddw %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pmulhw %xmm6, %xmm5 -; SSE41-NEXT: psraw $1, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm3, %xmm5 -; SSE41-NEXT: psraw $15, %xmm5 -; SSE41-NEXT: pmulhuw %xmm7, %xmm5 -; SSE41-NEXT: paddw %xmm3, %xmm5 -; SSE41-NEXT: pmulhw %xmm5, %xmm6 -; SSE41-NEXT: psraw $1, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: psraw $15, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1,4,2,16,8,32,64,2] +; SSE41-NEXT: pmulhuw %xmm5, %xmm6 +; SSE41-NEXT: paddw %xmm0, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pmulhw %xmm4, %xmm7 +; SSE41-NEXT: psraw $1, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: psraw $15, %xmm6 +; SSE41-NEXT: pmulhuw %xmm5, %xmm6 +; SSE41-NEXT: paddw %xmm1, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pmulhw %xmm4, %xmm7 +; SSE41-NEXT: psraw $1, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: psraw $15, %xmm6 +; SSE41-NEXT: pmulhuw %xmm5, %xmm6 +; SSE41-NEXT: paddw %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pmulhw %xmm4, %xmm7 +; SSE41-NEXT: psraw $1, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: psraw $15, %xmm6 +; SSE41-NEXT: pmulhuw %xmm5, %xmm6 +; SSE41-NEXT: paddw %xmm3, %xmm6 +; SSE41-NEXT: pmulhw %xmm6, %xmm4 +; SSE41-NEXT: psraw $1, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16: @@ -1047,8 +1040,7 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; SSE41-NEXT: psrad $3, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32: @@ -1134,23 +1126,23 @@ ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrld $28, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: psrld $30, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: 
psrld $29, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] -; SSE41-NEXT: paddd %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrld $29, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; SSE41-NEXT: paddd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrad $4, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: psrad $2, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm3 @@ -1168,8 +1160,7 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; SSE41-NEXT: psrad $3, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32: @@ -1316,43 +1307,42 @@ ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: psrld $28, %xmm5 -; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: psrld $30, %xmm6 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psrld $29, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; SSE41-NEXT: paddd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: psrad $4, %xmm5 -; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: psrad $2, %xmm6 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: psrad $3, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: psrld $28, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: psrld $30, %xmm6 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] -; SSE41-NEXT: paddd %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: psrld $29, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = 
xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; SSE41-NEXT: paddd %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: psrad $4, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: psrad $2, %xmm6 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: psrad $31, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 @@ -1370,27 +1360,25 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: psrad $3, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm3, %xmm5 -; SSE41-NEXT: psrad $31, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: psrld $28, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: psrld $28, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: psrld $30, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] -; SSE41-NEXT: paddd %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: psrad $4, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: psrld $29, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; SSE41-NEXT: paddd %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: psrad $4, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: psrad $2, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32: @@ -1553,8 +1541,7 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952] ; SSE41-NEXT: pxor %xmm2, %xmm1 ; SSE41-NEXT: psubq %xmm2, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64: @@ -1682,8 +1669,7 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952] ; SSE41-NEXT: pxor %xmm3, %xmm2 ; SSE41-NEXT: psubq %xmm3, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64: @@ 
-1859,20 +1845,18 @@ ; SSE41-NEXT: psrlq $62, %xmm4 ; SSE41-NEXT: paddq %xmm0, %xmm4 ; SSE41-NEXT: psrlq $2, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,2305843009213693952] -; SSE41-NEXT: pxor %xmm6, %xmm4 -; SSE41-NEXT: psubq %xmm6, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm4[4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: psrad $31, %xmm5 -; SSE41-NEXT: psrlq $62, %xmm5 -; SSE41-NEXT: paddq %xmm2, %xmm5 -; SSE41-NEXT: psrlq $2, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm5 -; SSE41-NEXT: psubq %xmm6, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,2305843009213693952] +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: psubq %xmm5, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: psrlq $62, %xmm4 +; SSE41-NEXT: paddq %xmm2, %xmm4 +; SSE41-NEXT: psrlq $2, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: psubq %xmm5, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64: @@ -2054,8 +2038,7 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psubd %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: @@ -2402,8 +2385,7 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psubd %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: non_splat_minus_one_divisor_2: @@ -2450,8 +2432,7 @@ ; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrlw $15, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_sdiv_nonuniform: Index: test/CodeGen/X86/combine-sra.ll =================================================================== --- test/CodeGen/X86/combine-sra.ll +++ test/CodeGen/X86/combine-sra.ll @@ -195,15 +195,15 @@ ; SSE-LABEL: combine_vec_ashr_trunc_lshr: ; SSE: # %bb.0: ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: psrad $3, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: psrad $1, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: psrad $2, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE-NEXT: psrad $3, %xmm0 -; SSE-NEXT: psrad $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SSE-NEXT: retq ; ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr: @@ -235,15 +235,15 @@ ; SSE-LABEL: combine_vec_ashr_trunc_ashr: ; SSE: # %bb.0: ; SSE-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[1,3],xmm1[1,3] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: psrad $3, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: psrad $1, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: psrad $2, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE-NEXT: psrad $3, %xmm0 -; SSE-NEXT: psrad $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SSE-NEXT: retq ; ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr: Index: test/CodeGen/X86/combine-srem.ll =================================================================== --- test/CodeGen/X86/combine-srem.ll +++ test/CodeGen/X86/combine-srem.ll @@ -55,13 +55,12 @@ define i32 @combine_srem_by_minsigned(i32 %x) { ; CHECK-LABEL: combine_srem_by_minsigned: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: sarl $31, %eax ; CHECK-NEXT: shrl %eax ; CHECK-NEXT: addl %edi, %eax ; CHECK-NEXT: andl $-2147483648, %eax # imm = 0x80000000 -; CHECK-NEXT: leal (%rax,%rdi), %eax +; CHECK-NEXT: addl %edi, %eax ; CHECK-NEXT: retq %1 = srem i32 %x, -2147483648 ret i32 %1 @@ -334,8 +333,7 @@ ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: combine_vec_srem_by_pow2b_neg: Index: test/CodeGen/X86/combine-srl.ll =================================================================== --- test/CodeGen/X86/combine-srl.ll +++ test/CodeGen/X86/combine-srl.ll @@ -337,29 +337,28 @@ ; SSE-LABEL: combine_vec_lshr_lzcnt_bit1: ; SSE: # %bb.0: ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pshufb %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlw $4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pshufb %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrlw $4, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pshufb %xmm1, %xmm2 +; SSE-NEXT: pshufb %xmm3, %xmm1 +; SSE-NEXT: pcmpeqb %xmm4, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: paddb %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: psrlw $8, %xmm3 +; SSE-NEXT: paddw %xmm1, %xmm3 ; SSE-NEXT: pcmpeqw %xmm4, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: psrld $5, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: paddd %xmm3, %xmm0 +; SSE-NEXT: psrld $5, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_lzcnt_bit1: Index: test/CodeGen/X86/combine-udiv.ll 
=================================================================== --- test/CodeGen/X86/combine-udiv.ll +++ test/CodeGen/X86/combine-udiv.ll @@ -189,15 +189,15 @@ ; ; SSE41-LABEL: combine_vec_udiv_by_pow2b: ; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $4, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrld $2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrld $3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psrld $4, %xmm0 -; SSE41-NEXT: psrld $2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_udiv_by_pow2b: @@ -568,11 +568,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) { ; SSE2-LABEL: combine_vec_udiv_nonuniform2: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1 @@ -583,10 +583,9 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_vec_udiv_nonuniform2: @@ -734,38 +733,37 @@ define <8 x i16> @pr38477(<8 x i16> %a0) { ; SSE2-LABEL: pr38477: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4957,57457,4103,16385,35545,2048,2115] -; SSE2-NEXT: pmulhuw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psubw %xmm2, %xmm1 -; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; SSE2-NEXT: paddw %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115] +; SSE2-NEXT: pmulhuw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psubw %xmm1, %xmm2 +; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm2 +; SSE2-NEXT: paddw %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,0,65535] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm1, %xmm2 +; 
SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: pr38477: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4957,57457,4103,16385,35545,2048,2115] -; SSE41-NEXT: pmulhuw %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psubw %xmm2, %xmm1 -; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: paddw %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = -; SSE41-NEXT: pmulhuw %xmm1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5],xmm1[6],xmm2[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115] +; SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psubw %xmm1, %xmm2 +; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm2 +; SSE41-NEXT: paddw %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; SSE41-NEXT: pmulhuw %xmm2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX-LABEL: pr38477: Index: test/CodeGen/X86/fshl.ll =================================================================== --- test/CodeGen/X86/fshl.ll +++ test/CodeGen/X86/fshl.ll @@ -162,14 +162,13 @@ ; ; X64-SLOW-LABEL: var_shift_i32: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movl %esi, %eax -; X64-SLOW-NEXT: movl %edi, %esi +; X64-SLOW-NEXT: movl %edi, %eax ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: shll %cl, %esi +; X64-SLOW-NEXT: shll %cl, %eax ; X64-SLOW-NEXT: andb $31, %dl ; X64-SLOW-NEXT: movl %edx, %ecx ; X64-SLOW-NEXT: negb %cl -; X64-SLOW-NEXT: shrl %cl, %eax +; X64-SLOW-NEXT: shrl %cl, %esi ; X64-SLOW-NEXT: orl %esi, %eax ; X64-SLOW-NEXT: testb %dl, %dl ; X64-SLOW-NEXT: cmovel %edi, %eax @@ -349,14 +348,13 @@ ; ; X64-SLOW-LABEL: var_shift_i64: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movq %rsi, %rax -; X64-SLOW-NEXT: movq %rdi, %rsi +; X64-SLOW-NEXT: movq %rdi, %rax ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: shlq %cl, %rsi +; X64-SLOW-NEXT: shlq %cl, %rax ; X64-SLOW-NEXT: andb $63, %dl ; X64-SLOW-NEXT: movl %edx, %ecx ; X64-SLOW-NEXT: negb %cl -; X64-SLOW-NEXT: shrq %cl, %rax +; X64-SLOW-NEXT: shrq %cl, %rsi ; X64-SLOW-NEXT: orq %rsi, %rax ; X64-SLOW-NEXT: testb %dl, %dl ; X64-SLOW-NEXT: cmoveq %rdi, %rax Index: test/CodeGen/X86/fshr.ll =================================================================== --- test/CodeGen/X86/fshr.ll +++ test/CodeGen/X86/fshr.ll @@ -161,14 +161,13 @@ ; ; X64-SLOW-LABEL: var_shift_i32: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movl %edi, %eax -; X64-SLOW-NEXT: movl %esi, %edi +; X64-SLOW-NEXT: movl %esi, %eax ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: shrl %cl, %edi +; X64-SLOW-NEXT: shrl %cl, %eax ; X64-SLOW-NEXT: andb $31, %dl ; X64-SLOW-NEXT: movl %edx, %ecx ; X64-SLOW-NEXT: negb %cl -; X64-SLOW-NEXT: shll %cl, %eax +; X64-SLOW-NEXT: shll %cl, %edi ; X64-SLOW-NEXT: orl %edi, %eax ; X64-SLOW-NEXT: testb %dl, %dl ; X64-SLOW-NEXT: cmovel %esi, %eax @@ -346,14 +345,13 @@ ; ; X64-SLOW-LABEL: var_shift_i64: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movq %rdi, %rax -; X64-SLOW-NEXT: movq %rsi, %rdi +; X64-SLOW-NEXT: movq %rsi, %rax ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: shrq %cl, %rdi +; X64-SLOW-NEXT: shrq %cl, %rax ; X64-SLOW-NEXT: andb $63, %dl ; X64-SLOW-NEXT: movl %edx, %ecx ; X64-SLOW-NEXT: negb %cl -; X64-SLOW-NEXT: shlq %cl, %rax +; X64-SLOW-NEXT: shlq %cl, %rdi ; 
X64-SLOW-NEXT: orq %rdi, %rax ; X64-SLOW-NEXT: testb %dl, %dl ; X64-SLOW-NEXT: cmoveq %rsi, %rax Index: test/CodeGen/X86/funnel-shift.ll =================================================================== --- test/CodeGen/X86/funnel-shift.ll +++ test/CodeGen/X86/funnel-shift.ll @@ -111,10 +111,9 @@ ; X64-AVX2-NEXT: subl %r8d, %ecx ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-AVX2-NEXT: shrq %cl, %rsi -; X64-AVX2-NEXT: orq %rax, %rsi +; X64-AVX2-NEXT: orq %rsi, %rax ; X64-AVX2-NEXT: testq %r8, %r8 -; X64-AVX2-NEXT: cmoveq %rdi, %rsi -; X64-AVX2-NEXT: movq %rsi, %rax +; X64-AVX2-NEXT: cmoveq %rdi, %rax ; X64-AVX2-NEXT: retq %f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z) ret i37 %f @@ -297,10 +296,10 @@ ; X64-AVX2-NEXT: subl %r8d, %ecx ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-AVX2-NEXT: shlq %cl, %rdi -; X64-AVX2-NEXT: orq %r9, %rdi +; X64-AVX2-NEXT: orq %rdi, %r9 ; X64-AVX2-NEXT: testq %r8, %r8 -; X64-AVX2-NEXT: cmoveq %rsi, %rdi -; X64-AVX2-NEXT: movq %rdi, %rax +; X64-AVX2-NEXT: cmoveq %rsi, %r9 +; X64-AVX2-NEXT: movq %r9, %rax ; X64-AVX2-NEXT: retq %f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z) ret i37 %f Index: test/CodeGen/X86/haddsub-undef.ll =================================================================== --- test/CodeGen/X86/haddsub-undef.ll +++ test/CodeGen/X86/haddsub-undef.ll @@ -117,8 +117,7 @@ ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: test5_undef: @@ -430,8 +429,7 @@ ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] -; SSE-SLOW-NEXT: addpd %xmm0, %xmm1 -; SSE-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE-SLOW-NEXT: addpd %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: add_pd_003_2: Index: test/CodeGen/X86/haddsub.ll =================================================================== --- test/CodeGen/X86/haddsub.ll +++ test/CodeGen/X86/haddsub.ll @@ -43,8 +43,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addpd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addpd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: haddpd3: @@ -648,8 +647,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: extract_extract_v2f64_fadd_f64: @@ -678,8 +676,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: extract_extract_v2f64_fadd_f64_commute: @@ -866,8 +863,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: extract_extract_v4f64_fadd_f64: @@ -898,8 +894,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 
= xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: extract_extract_v4f64_fadd_f64_commute: @@ -1098,8 +1093,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: extract_extract_v8f64_fadd_f64: @@ -1130,8 +1124,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: extract_extract_v8f64_fadd_f64_commute: Index: test/CodeGen/X86/imul.ll =================================================================== --- test/CodeGen/X86/imul.ll +++ test/CodeGen/X86/imul.ll @@ -215,10 +215,9 @@ define i32 @mul33_32(i32 %A) { ; X64-LABEL: mul33_32: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $5, %eax -; X64-NEXT: leal (%rax,%rdi), %eax +; X64-NEXT: addl %edi, %eax ; X64-NEXT: retq ; ; X86-LABEL: mul33_32: @@ -344,10 +343,9 @@ define i32 @test2(i32 %a) { ; X64-LABEL: test2: ; X64: # %bb.0: # %entry -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $5, %eax -; X64-NEXT: leal (%rax,%rdi), %eax +; X64-NEXT: addl %edi, %eax ; X64-NEXT: retq ; ; X86-LABEL: test2: @@ -365,10 +363,9 @@ define i32 @test3(i32 %a) { ; X64-LABEL: test3: ; X64: # %bb.0: # %entry -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $5, %eax -; X64-NEXT: leal (%rax,%rdi), %eax +; X64-NEXT: addl %edi, %eax ; X64-NEXT: negl %eax ; X64-NEXT: retq ; @@ -446,7 +443,7 @@ ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: shlq $5, %rax -; X64-NEXT: leaq (%rax,%rdi), %rax +; X64-NEXT: addq %rdi, %rax ; X64-NEXT: retq ; ; X86-LABEL: test6: @@ -469,7 +466,7 @@ ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: shlq $5, %rax -; X64-NEXT: leaq (%rax,%rdi), %rax +; X64-NEXT: addq %rdi, %rax ; X64-NEXT: negq %rax ; X64-NEXT: retq ; Index: test/CodeGen/X86/mul-constant-i16.ll =================================================================== --- test/CodeGen/X86/mul-constant-i16.ll +++ test/CodeGen/X86/mul-constant-i16.ll @@ -318,10 +318,9 @@ ; ; X64-LABEL: test_mul_by_17: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $4, %eax -; X64-NEXT: leal (%rax,%rdi), %eax +; X64-NEXT: addl %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %mul = mul nsw i16 %x, 17 Index: test/CodeGen/X86/mul-constant-i32.ll =================================================================== --- test/CodeGen/X86/mul-constant-i32.ll +++ test/CodeGen/X86/mul-constant-i32.ll @@ -841,18 +841,16 @@ ; ; X64-HSW-LABEL: test_mul_by_17: ; X64-HSW: # %bb.0: -; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] ; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50] +; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] ; X64-HSW-NEXT: retq # sched: [7:1.00] ; ; X64-JAG-LABEL: test_mul_by_17: ; X64-JAG: # %bb.0: -; X64-JAG-NEXT: # kill: def $edi 
killed $edi def $rdi ; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50] ; X64-JAG-NEXT: shll $4, %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50] +; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] ; X64-JAG-NEXT: retq # sched: [4:1.00] ; ; X86-NOOPT-LABEL: test_mul_by_17: @@ -872,10 +870,9 @@ ; ; X64-SLM-LABEL: test_mul_by_17: ; X64-SLM: # %bb.0: -; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi ; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] ; X64-SLM-NEXT: shll $4, %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rdi), %eax # sched: [1:1.00] +; X64-SLM-NEXT: addl %edi, %eax # sched: [1:0.50] ; X64-SLM-NEXT: retq # sched: [4:1.00] ; ; SLM-NOOPT-LABEL: test_mul_by_17: @@ -1875,10 +1872,9 @@ ; ; X64-SLM-LABEL: test_mul_by_66: ; X64-SLM: # %bb.0: -; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi ; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] ; X64-SLM-NEXT: shll $6, %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rdi), %eax # sched: [1:1.00] +; X64-SLM-NEXT: addl %edi, %eax # sched: [1:0.50] ; X64-SLM-NEXT: addl %edi, %eax # sched: [1:0.50] ; X64-SLM-NEXT: retq # sched: [4:1.00] ; Index: test/CodeGen/X86/mul-constant-i64.ll =================================================================== --- test/CodeGen/X86/mul-constant-i64.ll +++ test/CodeGen/X86/mul-constant-i64.ll @@ -872,14 +872,14 @@ ; X64-HSW: # %bb.0: ; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] ; X64-HSW-NEXT: shlq $4, %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] +; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] ; X64-HSW-NEXT: retq # sched: [7:1.00] ; ; X64-JAG-LABEL: test_mul_by_17: ; X64-JAG: # %bb.0: ; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.50] ; X64-JAG-NEXT: shlq $4, %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] +; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] ; X64-JAG-NEXT: retq # sched: [4:1.00] ; ; X86-NOOPT-LABEL: test_mul_by_17: Index: test/CodeGen/X86/mul-constant-i8.ll =================================================================== --- test/CodeGen/X86/mul-constant-i8.ll +++ test/CodeGen/X86/mul-constant-i8.ll @@ -188,10 +188,9 @@ define i8 @test_mul_by_17(i8 %x) { ; X64-LABEL: test_mul_by_17: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $4, %eax -; X64-NEXT: leal (%rax,%rdi), %eax +; X64-NEXT: addl %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 17 Index: test/CodeGen/X86/palignr.ll =================================================================== --- test/CodeGen/X86/palignr.ll +++ test/CodeGen/X86/palignr.ll @@ -167,8 +167,8 @@ ; CHECK-SSE2-LABEL: test9: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; CHECK-SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; CHECK-SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; CHECK-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] ; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retl ; Index: test/CodeGen/X86/phaddsub.ll =================================================================== --- test/CodeGen/X86/phaddsub.ll +++ test/CodeGen/X86/phaddsub.ll @@ -697,8 +697,7 @@ ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1 ; 
SSSE3-SLOW-NEXT: pslld $16, %xmm1 -; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: phaddw_single_source4: Index: test/CodeGen/X86/pmul.ll =================================================================== --- test/CodeGen/X86/pmul.ll +++ test/CodeGen/X86/pmul.ll @@ -1017,11 +1017,11 @@ ; SSE2-LABEL: mul_v4i64_zero_upper: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 @@ -1171,27 +1171,27 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE2-LABEL: mul_v8i64_zero_upper: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: pmuludq %xmm7, %xmm4 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pmuludq %xmm6, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: pmuludq %xmm0, %xmm5 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: pmuludq %xmm7, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] ; SSE2-NEXT: movaps %xmm4, %xmm0 -; SSE2-NEXT: movaps %xmm5, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v8i64_zero_upper: Index: test/CodeGen/X86/popcnt.ll =================================================================== --- test/CodeGen/X86/popcnt.ll +++ test/CodeGen/X86/popcnt.ll @@ -25,7 +25,6 @@ ; ; X64-LABEL: cnt8: ; X64: # %bb.0: -; X64-NEXT: # kill: 
def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shrb %al ; X64-NEXT: andb $85, %al @@ -37,9 +36,8 @@ ; X64-NEXT: addb %al, %dil ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shrb $4, %al -; X64-NEXT: leal (%rax,%rdi), %eax +; X64-NEXT: addb %dil, %al ; X64-NEXT: andb $15, %al -; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X32-POPCNT-LABEL: cnt8: @@ -225,7 +223,7 @@ ; X64-NEXT: addq %rcx, %rdi ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: shrq $4, %rax -; X64-NEXT: leaq (%rax,%rdi), %rax +; X64-NEXT: addq %rdi, %rax ; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F ; X64-NEXT: andq %rax, %rcx ; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 Index: test/CodeGen/X86/powi.ll =================================================================== --- test/CodeGen/X86/powi.ll +++ test/CodeGen/X86/powi.ll @@ -10,8 +10,7 @@ ; CHECK-NEXT: mulsd %xmm1, %xmm1 ; CHECK-NEXT: mulsd %xmm1, %xmm0 ; CHECK-NEXT: mulsd %xmm1, %xmm1 -; CHECK-NEXT: mulsd %xmm0, %xmm1 -; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: mulsd %xmm1, %xmm0 ; CHECK-NEXT: retq %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; [#uses=1] ret double %ret Index: test/CodeGen/X86/psubus.ll =================================================================== --- test/CodeGen/X86/psubus.ll +++ test/CodeGen/X86/psubus.ll @@ -1410,24 +1410,25 @@ ; SSE2-LABEL: psubus_8i32_max: ; SSE2: # %bb.0: # %vector.ph ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 ; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: psubd %xmm1, %xmm0 ; SSE2-NEXT: psubd %xmm2, %xmm4 @@ -1787,54 +1788,55 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; SSE2-LABEL: psubus_16i32_max: ; SSE2: # %bb.0: # %vector.ph -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = 
xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: por %xmm7, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm9 -; SSE2-NEXT: por %xmm0, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm7 +; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 ; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm10 ; SSE2-NEXT: pandn %xmm2, %xmm0 ; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pxor %xmm7, %xmm10 -; SSE2-NEXT: movdqa %xmm8, %xmm6 -; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm8 +; SSE2-NEXT: pand %xmm6, %xmm9 ; SSE2-NEXT: pandn %xmm5, %xmm6 -; SSE2-NEXT: por %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: pxor %xmm7, %xmm8 -; SSE2-NEXT: por %xmm1, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pandn %xmm4, %xmm7 -; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: por %xmm9, %xmm6 +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: por %xmm8, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm8 +; SSE2-NEXT: pandn %xmm4, %xmm1 +; SSE2-NEXT: por %xmm8, %xmm1 ; SSE2-NEXT: psubd %xmm4, %xmm1 ; SSE2-NEXT: psubd %xmm5, %xmm6 ; SSE2-NEXT: psubd %xmm2, %xmm0 -; SSE2-NEXT: psubd %xmm3, %xmm9 -; SSE2-NEXT: pslld $16, %xmm9 -; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: psubd %xmm3, %xmm7 +; SSE2-NEXT: pslld $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm7 ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm9, %xmm0 +; SSE2-NEXT: packssdw %xmm7, %xmm0 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 @@ -1844,54 +1846,55 @@ ; ; SSSE3-LABEL: psubus_16i32_max: ; SSSE3: # %bb.0: # %vector.ph -; SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = 
xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] ; SSSE3-NEXT: movdqa %xmm0, %xmm10 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: pxor %xmm7, %xmm6 -; SSSE3-NEXT: movdqa %xmm0, %xmm9 -; SSSE3-NEXT: por %xmm7, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9 -; SSSE3-NEXT: pand %xmm9, %xmm0 -; SSSE3-NEXT: pandn %xmm3, %xmm9 -; SSSE3-NEXT: por %xmm0, %xmm9 +; SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pandn %xmm3, %xmm7 +; SSSE3-NEXT: por %xmm0, %xmm7 ; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm7, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm6 ; SSSE3-NEXT: movdqa %xmm10, %xmm0 -; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm10 ; SSSE3-NEXT: pandn %xmm2, %xmm0 ; SSSE3-NEXT: por %xmm10, %xmm0 ; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSSE3-NEXT: pxor %xmm7, %xmm10 -; SSSE3-NEXT: movdqa %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm7, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm10 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: por %xmm1, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm8 +; SSSE3-NEXT: pand %xmm6, %xmm9 ; SSSE3-NEXT: pandn %xmm5, %xmm6 -; SSSE3-NEXT: por %xmm8, %xmm6 -; SSSE3-NEXT: movdqa %xmm4, %xmm8 -; SSSE3-NEXT: pxor %xmm7, %xmm8 -; SSSE3-NEXT: por %xmm1, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: pandn %xmm4, %xmm7 -; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: por %xmm9, %xmm6 +; SSSE3-NEXT: movdqa %xmm4, %xmm9 +; SSSE3-NEXT: pxor %xmm1, %xmm9 +; SSSE3-NEXT: por %xmm8, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm8 +; SSSE3-NEXT: pandn %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm8, %xmm1 ; SSSE3-NEXT: psubd %xmm4, %xmm1 ; SSSE3-NEXT: psubd %xmm5, %xmm6 ; SSSE3-NEXT: psubd %xmm2, %xmm0 -; SSSE3-NEXT: psubd %xmm3, %xmm9 -; SSSE3-NEXT: pslld $16, %xmm9 -; SSSE3-NEXT: psrad $16, %xmm9 +; SSSE3-NEXT: psubd %xmm3, %xmm7 +; SSSE3-NEXT: pslld $16, %xmm7 +; SSSE3-NEXT: psrad $16, %xmm7 ; SSSE3-NEXT: pslld $16, %xmm0 ; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: packssdw %xmm9, %xmm0 +; SSSE3-NEXT: packssdw %xmm7, %xmm0 ; SSSE3-NEXT: pslld $16, %xmm6 ; SSSE3-NEXT: psrad $16, %xmm6 ; SSSE3-NEXT: pslld $16, %xmm1 Index: test/CodeGen/X86/rotate-extract.ll =================================================================== --- test/CodeGen/X86/rotate-extract.ll +++ test/CodeGen/X86/rotate-extract.ll @@ -156,7 +156,7 @@ ; X64-NEXT: shlq $5, %rax ; X64-NEXT: shlq $10, %rdi ; X64-NEXT: shrq $57, %rax -; X64-NEXT: leaq (%rax,%rdi), %rax +; X64-NEXT: orq %rdi, %rax ; X64-NEXT: retq %lhs_mul = shl i64 %i, 5 %rhs_mul = shl i64 %i, 10 @@ -179,12 +179,11 @@ ; ; X64-LABEL: no_extract_shrl: ; X64: # 
%bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andl $-8, %eax ; X64-NEXT: shll $25, %eax ; X64-NEXT: shrl $9, %edi -; X64-NEXT: leal (%rdi,%rax), %eax +; X64-NEXT: orl %edi, %eax ; X64-NEXT: retq %lhs_div = lshr i32 %i, 3 %rhs_div = lshr i32 %i, 9 Index: test/CodeGen/X86/sat-add.ll =================================================================== --- test/CodeGen/X86/sat-add.ll +++ test/CodeGen/X86/sat-add.ll @@ -240,12 +240,11 @@ define i16 @unsigned_sat_variable_i16_using_min(i16 %x, i16 %y) { ; ANY-LABEL: unsigned_sat_variable_i16_using_min: ; ANY: # %bb.0: -; ANY-NEXT: # kill: def $esi killed $esi def $rsi ; ANY-NEXT: movl %esi, %eax ; ANY-NEXT: notl %eax ; ANY-NEXT: cmpw %ax, %di ; ANY-NEXT: cmovbl %edi, %eax -; ANY-NEXT: leal (%rax,%rsi), %eax +; ANY-NEXT: addl %esi, %eax ; ANY-NEXT: # kill: def $ax killed $ax killed $eax ; ANY-NEXT: retq %noty = xor i16 %y, -1 @@ -291,12 +290,11 @@ define i32 @unsigned_sat_variable_i32_using_min(i32 %x, i32 %y) { ; ANY-LABEL: unsigned_sat_variable_i32_using_min: ; ANY: # %bb.0: -; ANY-NEXT: # kill: def $esi killed $esi def $rsi ; ANY-NEXT: movl %esi, %eax ; ANY-NEXT: notl %eax ; ANY-NEXT: cmpl %eax, %edi ; ANY-NEXT: cmovbl %edi, %eax -; ANY-NEXT: leal (%rax,%rsi), %eax +; ANY-NEXT: addl %esi, %eax ; ANY-NEXT: retq %noty = xor i32 %y, -1 %c = icmp ult i32 %x, %noty @@ -343,7 +341,7 @@ ; ANY-NEXT: notq %rax ; ANY-NEXT: cmpq %rax, %rdi ; ANY-NEXT: cmovbq %rdi, %rax -; ANY-NEXT: leaq (%rax,%rsi), %rax +; ANY-NEXT: addq %rsi, %rax ; ANY-NEXT: retq %noty = xor i64 %y, -1 %c = icmp ult i64 %x, %noty @@ -496,15 +494,14 @@ ; ; SSE41-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [42,42,42,42] -; SSE41-NEXT: paddd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pminud %xmm2, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42,42,42,42] +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pminud %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq %a = add <4 x i32> %x, %c = icmp ugt <4 x i32> %x, %a @@ -786,11 +783,10 @@ ; SSE41-NEXT: paddd %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pminud %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: por %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq %a = add <4 x i32> %x, %y %c = icmp ugt <4 x i32> %x, %a Index: test/CodeGen/X86/sqrt-fastmath.ll =================================================================== --- test/CodeGen/X86/sqrt-fastmath.ll +++ test/CodeGen/X86/sqrt-fastmath.ll @@ -175,17 +175,17 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 { ; SSE-LABEL: sqrt_v4f32_check_denorms: ; SSE: # %bb.0: -; SSE-NEXT: rsqrtps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm2, %xmm1 +; SSE-NEXT: rsqrtps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; SSE-NEXT: mulps %xmm1, %xmm3 -; SSE-NEXT: 
mulps %xmm2, %xmm1 -; SSE-NEXT: addps {{.*}}(%rip), %xmm1 -; SSE-NEXT: mulps %xmm3, %xmm1 +; SSE-NEXT: mulps %xmm2, %xmm3 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps {{.*}}(%rip), %xmm2 +; SSE-NEXT: mulps %xmm3, %xmm2 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] -; SSE-NEXT: cmpleps %xmm0, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SSE-NEXT: cmpleps %xmm0, %xmm1 ; SSE-NEXT: andps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq Index: test/CodeGen/X86/sse-minmax.ll =================================================================== --- test/CodeGen/X86/sse-minmax.ll +++ test/CodeGen/X86/sse-minmax.ll @@ -103,8 +103,7 @@ ; STRICT-NEXT: cmplesd %xmm1, %xmm2 ; STRICT-NEXT: andpd %xmm2, %xmm0 ; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm0, %xmm2 -; STRICT-NEXT: movapd %xmm2, %xmm0 +; STRICT-NEXT: orpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ole: @@ -261,11 +260,10 @@ define double @ole_x(double %x) { ; STRICT-LABEL: ole_x: ; STRICT: # %bb.0: -; STRICT-NEXT: xorpd %xmm2, %xmm2 -; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmplesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm0, %xmm1 -; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: xorpd %xmm1, %xmm1 +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: cmplesd %xmm1, %xmm2 +; STRICT-NEXT: andpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ole_x: @@ -338,8 +336,7 @@ ; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 ; STRICT-NEXT: andpd %xmm2, %xmm0 ; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm0, %xmm2 -; STRICT-NEXT: movapd %xmm2, %xmm0 +; STRICT-NEXT: orpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ugt: @@ -499,11 +496,10 @@ define double @ugt_x(double %x) { ; STRICT-LABEL: ugt_x: ; STRICT: # %bb.0: -; STRICT-NEXT: xorpd %xmm2, %xmm2 -; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmpnlesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm0, %xmm1 -; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: xorpd %xmm1, %xmm1 +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 +; STRICT-NEXT: andpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ugt_x: @@ -762,13 +758,12 @@ define double @ole_y(double %x) { ; STRICT-LABEL: ole_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmplesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm1, %xmm0 -; STRICT-NEXT: andnpd %xmm2, %xmm1 -; STRICT-NEXT: orpd %xmm0, %xmm1 -; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: cmplesd %xmm1, %xmm2 +; STRICT-NEXT: andpd %xmm2, %xmm0 +; STRICT-NEXT: andnpd %xmm1, %xmm2 +; STRICT-NEXT: orpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ole_y: @@ -839,13 +834,12 @@ define double @ugt_y(double %x) { ; STRICT-LABEL: ugt_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmpnlesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm1, %xmm0 -; STRICT-NEXT: andnpd %xmm2, %xmm1 -; STRICT-NEXT: orpd %xmm0, %xmm1 -; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 +; STRICT-NEXT: andpd %xmm2, %xmm0 +; STRICT-NEXT: andnpd %xmm1, %xmm2 +; STRICT-NEXT: orpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ugt_y: Index: 
test/CodeGen/X86/sse2-intrinsics-canonical.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-canonical.ll +++ test/CodeGen/X86/sse2-intrinsics-canonical.ll @@ -236,9 +236,8 @@ ; SSE-NEXT: pcmpgtd %xmm3, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd3] ; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] ; SSE-NEXT: pandn %xmm3, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd3] -; SSE-NEXT: por %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd0] -; SSE-NEXT: psubd %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xfa,0xd1] -; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2] +; SSE-NEXT: por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2] +; SSE-NEXT: psubd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xfa,0xc1] ; SSE-NEXT: retl ## encoding: [0xc3] ; ; AVX2-LABEL: test_x86_sse2_psubus_w_64: Index: test/CodeGen/X86/sse2-schedule.ll =================================================================== --- test/CodeGen/X86/sse2-schedule.ll +++ test/CodeGen/X86/sse2-schedule.ll @@ -8586,8 +8586,7 @@ ; GENERIC-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: pandn (%rdi), %xmm1 # sched: [7:0.50] -; GENERIC-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] -; GENERIC-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; ATOM-LABEL: test_pandn: @@ -8595,8 +8594,7 @@ ; ATOM-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.50] ; ATOM-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.50] ; ATOM-NEXT: pandn (%rdi), %xmm1 # sched: [1:1.00] -; ATOM-NEXT: paddq %xmm0, %xmm1 # sched: [2:1.00] -; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; ATOM-NEXT: paddq %xmm1, %xmm0 # sched: [2:1.00] ; ATOM-NEXT: retq # sched: [79:39.50] ; ; SLM-LABEL: test_pandn: @@ -8604,8 +8602,7 @@ ; SLM-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.50] ; SLM-NEXT: pandn (%rdi), %xmm1 # sched: [4:1.00] -; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] -; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-SSE-LABEL: test_pandn: @@ -8613,8 +8610,7 @@ ; SANDY-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33] ; SANDY-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33] ; SANDY-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [7:0.50] -; SANDY-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] -; SANDY-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; SANDY-SSE-NEXT: retq # sched: [1:1.00] ; ; SANDY-LABEL: test_pandn: @@ -8629,8 +8625,7 @@ ; HASWELL-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33] ; HASWELL-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33] ; HASWELL-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [7:0.50] -; HASWELL-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] -; HASWELL-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; HASWELL-SSE-NEXT: retq # sched: [7:1.00] ; ; HASWELL-LABEL: test_pandn: @@ -8645,8 +8640,7 @@ ; BROADWELL-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33] ; BROADWELL-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33] ; BROADWELL-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [6:0.50] -; BROADWELL-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] -; BROADWELL-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33] +; BROADWELL-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; BROADWELL-SSE-NEXT: retq # sched: [7:1.00] ; ; BROADWELL-LABEL: 
test_pandn: @@ -8661,8 +8655,7 @@ ; SKYLAKE-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33] ; SKYLAKE-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33] ; SKYLAKE-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [7:0.50] -; SKYLAKE-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.33] -; SKYLAKE-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33] +; SKYLAKE-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.33] ; SKYLAKE-SSE-NEXT: retq # sched: [7:1.00] ; ; SKYLAKE-LABEL: test_pandn: @@ -8677,8 +8670,7 @@ ; SKX-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33] ; SKX-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33] ; SKX-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [7:0.50] -; SKX-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.33] -; SKX-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33] +; SKX-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.33] ; SKX-SSE-NEXT: retq # sched: [7:1.00] ; ; SKX-LABEL: test_pandn: @@ -8693,8 +8685,7 @@ ; BDVER2-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [2:0.50] ; BDVER2-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [2:0.50] ; BDVER2-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [7:0.50] -; BDVER2-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [2:0.50] -; BDVER2-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [2:0.50] +; BDVER2-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [2:0.50] ; BDVER2-SSE-NEXT: retq # sched: [5:1.00] ; ; BDVER2-LABEL: test_pandn: @@ -8709,8 +8700,7 @@ ; BTVER2-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.50] ; BTVER2-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [6:1.00] -; BTVER2-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] -; BTVER2-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-SSE-NEXT: retq # sched: [4:1.00] ; ; BTVER2-LABEL: test_pandn: @@ -8725,8 +8715,7 @@ ; ZNVER1-SSE-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.25] ; ZNVER1-SSE-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.25] ; ZNVER1-SSE-NEXT: pandn (%rdi), %xmm1 # sched: [8:0.50] -; ZNVER1-SSE-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.25] -; ZNVER1-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.25] ; ZNVER1-SSE-NEXT: retq # sched: [1:0.50] ; ; ZNVER1-LABEL: test_pandn: Index: test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll =================================================================== --- test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll +++ test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll @@ -40,10 +40,9 @@ ; X64-LABEL: test__blcic_u64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq $-1, %rcx -; X64-NEXT: addq $1, %rax -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: xorq $-1, %rax +; X64-NEXT: addq $1, %rdi +; X64-NEXT: andq %rdi, %rax ; X64-NEXT: retq %1 = xor i64 %a0, -1 %2 = add i64 %a0, 1 @@ -89,10 +88,9 @@ ; X64-LABEL: test__blsic_u64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq $-1, %rcx -; X64-NEXT: subq $1, %rax -; X64-NEXT: orq %rcx, %rax +; X64-NEXT: xorq $-1, %rax +; X64-NEXT: subq $1, %rdi +; X64-NEXT: orq %rdi, %rax ; X64-NEXT: retq %1 = xor i64 %a0, -1 %2 = sub i64 %a0, 1 @@ -104,10 +102,9 @@ ; X64-LABEL: test__t1mskc_u64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq $-1, %rcx -; X64-NEXT: addq $1, %rax -; X64-NEXT: orq %rcx, %rax +; X64-NEXT: xorq $-1, %rax +; X64-NEXT: addq $1, %rdi +; X64-NEXT: orq %rdi, %rax ; X64-NEXT: retq %1 = xor i64 %a0, -1 %2 = add i64 %a0, 1 @@ -119,10 +116,9 @@ ; X64-LABEL: test__tzmsk_u64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax 
-; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq $-1, %rcx -; X64-NEXT: subq $1, %rax -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: xorq $-1, %rax +; X64-NEXT: subq $1, %rdi +; X64-NEXT: andq %rdi, %rax ; X64-NEXT: retq %1 = xor i64 %a0, -1 %2 = sub i64 %a0, 1 Index: test/CodeGen/X86/tbm-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/tbm-intrinsics-fast-isel.ll +++ test/CodeGen/X86/tbm-intrinsics-fast-isel.ll @@ -72,10 +72,9 @@ ; X64-LABEL: test__blcic_u32: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: xorl $-1, %ecx -; X64-NEXT: addl $1, %eax -; X64-NEXT: andl %ecx, %eax +; X64-NEXT: xorl $-1, %eax +; X64-NEXT: addl $1, %edi +; X64-NEXT: andl %edi, %eax ; X64-NEXT: retq %1 = xor i32 %a0, -1 %2 = add i32 %a0, 1 @@ -154,10 +153,9 @@ ; X64-LABEL: test__blsic_u32: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: xorl $-1, %ecx -; X64-NEXT: subl $1, %eax -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: xorl $-1, %eax +; X64-NEXT: subl $1, %edi +; X64-NEXT: orl %edi, %eax ; X64-NEXT: retq %1 = xor i32 %a0, -1 %2 = sub i32 %a0, 1 @@ -178,10 +176,9 @@ ; X64-LABEL: test__t1mskc_u32: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: xorl $-1, %ecx -; X64-NEXT: addl $1, %eax -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: xorl $-1, %eax +; X64-NEXT: addl $1, %edi +; X64-NEXT: orl %edi, %eax ; X64-NEXT: retq %1 = xor i32 %a0, -1 %2 = add i32 %a0, 1 @@ -202,10 +199,9 @@ ; X64-LABEL: test__tzmsk_u32: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: xorl $-1, %ecx -; X64-NEXT: subl $1, %eax -; X64-NEXT: andl %ecx, %eax +; X64-NEXT: xorl $-1, %eax +; X64-NEXT: subl $1, %edi +; X64-NEXT: andl %edi, %eax ; X64-NEXT: retq %1 = xor i32 %a0, -1 %2 = sub i32 %a0, 1 Index: test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll =================================================================== --- test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll +++ test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll @@ -545,18 +545,18 @@ define i32 @out_constant_varx_mone(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_varx_mone: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edi, %eax -; CHECK-NOBMI-NEXT: andl %edx, %eax -; CHECK-NOBMI-NEXT: notl %edx -; CHECK-NOBMI-NEXT: orl %edx, %eax +; CHECK-NOBMI-NEXT: andl %edx, %edi +; CHECK-NOBMI-NEXT: movl %edx, %eax +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: orl %edi, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_varx_mone: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %edi, %eax -; CHECK-BMI-NEXT: andl %edx, %eax -; CHECK-BMI-NEXT: notl %edx -; CHECK-BMI-NEXT: orl %edx, %eax +; CHECK-BMI-NEXT: andl %edx, %edi +; CHECK-BMI-NEXT: movl %edx, %eax +; CHECK-BMI-NEXT: notl %eax +; CHECK-BMI-NEXT: orl %edi, %eax ; CHECK-BMI-NEXT: retq %notmask = xor i32 %mask, -1 %mx = and i32 %mask, %x @@ -676,11 +676,10 @@ ; CHECK-NOBMI-LABEL: out_constant_varx_42_invmask: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: movl %edx, %eax -; CHECK-NOBMI-NEXT: movl %edx, %ecx -; CHECK-NOBMI-NEXT: notl %ecx -; CHECK-NOBMI-NEXT: andl %edi, %ecx -; CHECK-NOBMI-NEXT: andl $42, %eax -; CHECK-NOBMI-NEXT: orl %ecx, %eax +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: andl %edi, %eax +; CHECK-NOBMI-NEXT: andl $42, %edx +; CHECK-NOBMI-NEXT: orl %edx, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_varx_42_invmask: @@ -762,18 +761,18 @@ 
define i32 @out_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_mone_vary_invmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax -; CHECK-NOBMI-NEXT: andl %edx, %eax -; CHECK-NOBMI-NEXT: notl %edx -; CHECK-NOBMI-NEXT: orl %edx, %eax +; CHECK-NOBMI-NEXT: andl %edx, %esi +; CHECK-NOBMI-NEXT: movl %edx, %eax +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: orl %esi, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_mone_vary_invmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax -; CHECK-BMI-NEXT: andl %edx, %eax -; CHECK-BMI-NEXT: notl %edx -; CHECK-BMI-NEXT: orl %edx, %eax +; CHECK-BMI-NEXT: andl %edx, %esi +; CHECK-BMI-NEXT: movl %edx, %eax +; CHECK-BMI-NEXT: notl %eax +; CHECK-BMI-NEXT: orl %esi, %eax ; CHECK-BMI-NEXT: retq %notmask = xor i32 %mask, -1 %mx = and i32 %notmask, -1 @@ -850,20 +849,20 @@ define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_42_vary_invmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax -; CHECK-NOBMI-NEXT: andl %edx, %eax -; CHECK-NOBMI-NEXT: notl %edx -; CHECK-NOBMI-NEXT: andl $42, %edx -; CHECK-NOBMI-NEXT: orl %edx, %eax +; CHECK-NOBMI-NEXT: andl %edx, %esi +; CHECK-NOBMI-NEXT: movl %edx, %eax +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: andl $42, %eax +; CHECK-NOBMI-NEXT: orl %esi, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_42_vary_invmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax -; CHECK-BMI-NEXT: andl %edx, %eax -; CHECK-BMI-NEXT: notl %edx -; CHECK-BMI-NEXT: andl $42, %edx -; CHECK-BMI-NEXT: orl %edx, %eax +; CHECK-BMI-NEXT: andl %edx, %esi +; CHECK-BMI-NEXT: movl %edx, %eax +; CHECK-BMI-NEXT: notl %eax +; CHECK-BMI-NEXT: andl $42, %eax +; CHECK-BMI-NEXT: orl %esi, %eax ; CHECK-BMI-NEXT: retq %notmask = xor i32 %mask, -1 %mx = and i32 %notmask, 42 Index: test/CodeGen/X86/usub_sat_vec.ll =================================================================== --- test/CodeGen/X86/usub_sat_vec.ll +++ test/CodeGen/X86/usub_sat_vec.ll @@ -783,48 +783,46 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; SSE2-LABEL: v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: psubd %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: psubd %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: psubd %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i32: ; SSSE3: # %bb.0: 
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm4 -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: psubd %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: psubd %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm3, %xmm4 +; SSSE3-NEXT: por %xmm4, %xmm1 ; SSSE3-NEXT: psubd %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i32: @@ -864,86 +862,82 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; SSE2-LABEL: v16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: psubd %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm8 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: psubd %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: psubd %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm9 -; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: psubd 
%xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: psubd %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm2 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v16i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm4, %xmm9 +; SSSE3-NEXT: pxor %xmm8, %xmm9 ; SSSE3-NEXT: movdqa %xmm0, %xmm10 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm10 -; SSSE3-NEXT: pandn %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm10, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm10 ; SSSE3-NEXT: por %xmm10, %xmm0 ; SSSE3-NEXT: psubd %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm8, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm8 -; SSSE3-NEXT: pandn %xmm5, %xmm1 -; SSSE3-NEXT: por %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: pxor %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm5, %xmm4 +; SSSE3-NEXT: por %xmm4, %xmm1 ; SSSE3-NEXT: psubd %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm6, %xmm5 -; SSSE3-NEXT: pxor %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pandn %xmm6, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: psubd %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm7, %xmm2 -; SSSE3-NEXT: pxor %xmm9, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9 -; SSSE3-NEXT: pand %xmm9, %xmm3 -; SSSE3-NEXT: pandn %xmm7, %xmm9 -; SSSE3-NEXT: por %xmm9, %xmm3 +; SSSE3-NEXT: movdqa %xmm6, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm6, %xmm5 +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: psubd %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm7, %xmm8 +; SSSE3-NEXT: por %xmm8, %xmm3 ; SSSE3-NEXT: psubd %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v16i32: @@ -1160,17 +1154,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: pxor %xmm6, %xmm5 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm6, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE41-NEXT: pand %xmm8, %xmm5 -; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm5, 
%xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: movdqa %xmm2, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 ; SSE41-NEXT: psubq %xmm2, %xmm5 @@ -1371,17 +1365,17 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm8 ; SSE41-NEXT: movdqa %xmm0, %xmm11 ; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm4, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm9 -; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm1 ; SSE41-NEXT: pxor %xmm10, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm12, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm9 ; SSE41-NEXT: psubq %xmm4, %xmm9 Index: test/CodeGen/X86/vec_ctbits.ll =================================================================== --- test/CodeGen/X86/vec_ctbits.ll +++ test/CodeGen/X86/vec_ctbits.ll @@ -23,11 +23,10 @@ ; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 -; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: psadbw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: paddb %xmm1, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psadbw %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true) ret <2 x i64> %c @@ -93,11 +92,10 @@ ; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 -; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: psadbw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: paddb %xmm1, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psadbw %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) ret <2 x i64> %c @@ -126,11 +124,10 @@ ; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 -; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: psadbw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: paddb %xmm1, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psadbw %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) ret <2 x i32> %c @@ -187,23 +184,22 @@ ; CHECK-LABEL: prompop: ; CHECK: # %bb.0: ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm2 -; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlw $1, %xmm1 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: psubb %xmm1, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = 
[51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: psrlw $1, %xmm2 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-NEXT: psubb %xmm2, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm0, %xmm3 -; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: pand %xmm2, %xmm3 ; CHECK-NEXT: psrlw $2, %xmm0 -; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 ; CHECK-NEXT: paddb %xmm3, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlw $4, %xmm1 -; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: psadbw %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: psrlw $4, %xmm2 +; CHECK-NEXT: paddb %xmm2, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: psadbw %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) ret <2 x i32> %c Index: test/CodeGen/X86/vec_minmax_sint.ll =================================================================== --- test/CodeGen/X86/vec_minmax_sint.ll +++ test/CodeGen/X86/vec_minmax_sint.ll @@ -121,17 +121,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -195,8 +195,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v4i32: @@ -225,14 +224,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v8i32: @@ -322,8 +319,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v16i8: @@ -352,14 +348,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn 
%xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v32i8: @@ -511,17 +505,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -585,8 +579,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v4i32: @@ -615,14 +608,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v8i32: @@ -712,8 +703,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v16i8: @@ -742,14 +732,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v32i8: Index: test/CodeGen/X86/vec_minmax_uint.ll =================================================================== --- test/CodeGen/X86/vec_minmax_uint.ll +++ test/CodeGen/X86/vec_minmax_uint.ll @@ -131,17 +131,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE41-NEXT: pand %xmm8, 
%xmm6 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -248,23 +248,22 @@ define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: max_gt_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v8i32: @@ -549,17 +548,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -666,23 +665,22 @@ define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: max_ge_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: 
movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v8i32: Index: test/CodeGen/X86/vec_sdiv_to_shift.ll =================================================================== --- test/CodeGen/X86/vec_sdiv_to_shift.ll +++ test/CodeGen/X86/vec_sdiv_to_shift.ll @@ -9,9 +9,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psraw $15, %xmm1 ; SSE-NEXT: psrlw $11, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: psraw $5, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: psraw $5, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: sdiv_vec8x16: @@ -32,9 +31,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psraw $15, %xmm1 ; SSE-NEXT: psrlw $11, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: psraw $5, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: psraw $5, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: sdiv_vec8x16_minsize: @@ -55,9 +53,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrad $31, %xmm1 ; SSE-NEXT: psrld $28, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: psrad $4, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: psrad $4, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: sdiv_vec4x32: @@ -104,15 +101,13 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: psrld $26, %xmm2 -; SSE-NEXT: paddd %xmm0, %xmm2 -; SSE-NEXT: psrad $6, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrad $31, %xmm3 -; SSE-NEXT: psrld $26, %xmm3 -; SSE-NEXT: paddd %xmm1, %xmm3 -; SSE-NEXT: psrad $6, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: psrad $6, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: psrld $26, %xmm2 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: psrad $6, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: sdiv8x32: @@ -147,15 +142,13 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psraw $15, %xmm2 ; SSE-NEXT: psrlw $14, %xmm2 -; SSE-NEXT: paddw %xmm0, %xmm2 -; SSE-NEXT: psraw $2, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psraw $15, %xmm3 -; SSE-NEXT: psrlw $14, %xmm3 -; SSE-NEXT: paddw %xmm1, %xmm3 -; SSE-NEXT: psraw $2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: psraw $2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psraw $15, %xmm2 +; SSE-NEXT: psrlw $14, %xmm2 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: psraw $2, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: sdiv16x16: Index: test/CodeGen/X86/vec_shift6.ll =================================================================== --- test/CodeGen/X86/vec_shift6.ll +++ test/CodeGen/X86/vec_shift6.ll @@ -77,8 +77,7 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pslld $1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; 
AVX-LABEL: test4: Index: test/CodeGen/X86/vector-bitreverse.ll =================================================================== --- test/CodeGen/X86/vector-bitreverse.ll +++ test/CodeGen/X86/vector-bitreverse.ll @@ -14,40 +14,36 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; SSE-LABEL: test_bitreverse_i8: ; SSE: # %bb.0: +; SSE-NEXT: rolb $4, %dil ; SSE-NEXT: movl %edi, %eax -; SSE-NEXT: rolb $4, %al -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andb $51, %cl -; SSE-NEXT: shlb $2, %cl -; SSE-NEXT: andb $-52, %al -; SSE-NEXT: shrb $2, %al -; SSE-NEXT: orb %cl, %al -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andb $85, %cl -; SSE-NEXT: addb %cl, %cl -; SSE-NEXT: andb $-86, %al -; SSE-NEXT: shrb %al -; SSE-NEXT: orb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: andb $51, %al +; SSE-NEXT: shlb $2, %al +; SSE-NEXT: andb $-52, %dil +; SSE-NEXT: shrb $2, %dil +; SSE-NEXT: orb %al, %dil +; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: andb $85, %al +; SSE-NEXT: addb %al, %al +; SSE-NEXT: andb $-86, %dil +; SSE-NEXT: shrb %dil +; SSE-NEXT: orb %dil, %al ; SSE-NEXT: retq ; ; AVX-LABEL: test_bitreverse_i8: ; AVX: # %bb.0: +; AVX-NEXT: rolb $4, %dil ; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: rolb $4, %al -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: andb $51, %cl -; AVX-NEXT: shlb $2, %cl -; AVX-NEXT: andb $-52, %al -; AVX-NEXT: shrb $2, %al -; AVX-NEXT: orb %cl, %al -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: andb $85, %cl -; AVX-NEXT: addb %cl, %cl -; AVX-NEXT: andb $-86, %al -; AVX-NEXT: shrb %al -; AVX-NEXT: orb %cl, %al -; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: andb $51, %al +; AVX-NEXT: shlb $2, %al +; AVX-NEXT: andb $-52, %dil +; AVX-NEXT: shrb $2, %dil +; AVX-NEXT: orb %al, %dil +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: andb $85, %al +; AVX-NEXT: addb %al, %al +; AVX-NEXT: andb $-86, %dil +; AVX-NEXT: shrb %dil +; AVX-NEXT: orb %dil, %al ; AVX-NEXT: retq ; ; XOP-LABEL: test_bitreverse_i8: Index: test/CodeGen/X86/vector-ext-logic.ll =================================================================== --- test/CodeGen/X86/vector-ext-logic.ll +++ test/CodeGen/X86/vector-ext-logic.ll @@ -355,21 +355,21 @@ define <8 x i32> @bool_sext_and(<8 x i1> %x, <8 x i1> %y) { ; SSE2-LABEL: bool_sext_and: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -393,21 +393,21 @@ define <8 x i32> @bool_sext_or(<8 x i1> %x, <8 x i1> %y) 
{ ; SSE2-LABEL: bool_sext_or: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -431,21 +431,21 @@ define <8 x i32> @bool_sext_xor(<8 x i1> %x, <8 x i1> %y) { ; SSE2-LABEL: bool_sext_xor: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq Index: test/CodeGen/X86/vector-fshl-128.ll =================================================================== --- test/CodeGen/X86/vector-fshl-128.ll +++ test/CodeGen/X86/vector-fshl-128.ll @@ -1919,21 +1919,21 @@ ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psllw %xmm5, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: psllw %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm7, %xmm7 ; SSE41-NEXT: psllw %xmm5, %xmm7 ; SSE41-NEXT: pshufb %xmm0, %xmm7 -; SSE41-NEXT: pand %xmm7, %xmm4 +; SSE41-NEXT: pand %xmm6, %xmm7 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE41-NEXT: psubb %xmm2, %xmm5 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm5 = 
xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: psrlw %xmm5, %xmm1 -; SSE41-NEXT: psrlw %xmm5, %xmm6 -; SSE41-NEXT: pshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm1, %xmm6 -; SSE41-NEXT: por %xmm6, %xmm4 +; SSE41-NEXT: psrlw %xmm5, %xmm4 +; SSE41-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm1, %xmm4 +; SSE41-NEXT: por %xmm7, %xmm4 ; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 @@ -2560,8 +2560,8 @@ ; SSE2-NEXT: pmullw %xmm4, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] -; SSE2-NEXT: pmullw %xmm5, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] +; SSE2-NEXT: pmullw %xmm2, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -2569,17 +2569,17 @@ ; SSE2-NEXT: pmullw %xmm4, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: pmullw %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: packuswb %xmm3, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm5 ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_funnnel_v16i8: @@ -2758,8 +2758,8 @@ ; X32-SSE-NEXT: pmullw %xmm4, %xmm3 ; X32-SSE-NEXT: psrlw $8, %xmm3 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] -; X32-SSE-NEXT: pmullw %xmm5, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] +; X32-SSE-NEXT: pmullw %xmm2, %xmm1 ; X32-SSE-NEXT: psrlw $8, %xmm1 ; X32-SSE-NEXT: packuswb %xmm3, %xmm1 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 @@ -2767,17 +2767,17 @@ ; X32-SSE-NEXT: pmullw %xmm4, %xmm3 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; X32-SSE-NEXT: pand %xmm4, %xmm3 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE-NEXT: pmullw %xmm5, %xmm2 -; X32-SSE-NEXT: pand %xmm4, %xmm2 -; X32-SSE-NEXT: packuswb %xmm3, %xmm2 -; X32-SSE-NEXT: por %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa %xmm0, %xmm5 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; X32-SSE-NEXT: pmullw %xmm2, %xmm5 +; X32-SSE-NEXT: pand %xmm4, %xmm5 +; X32-SSE-NEXT: packuswb %xmm3, %xmm5 +; X32-SSE-NEXT: por %xmm1, %xmm5 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; X32-SSE-NEXT: pand %xmm1, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm5 ; X32-SSE-NEXT: pandn %xmm0, %xmm1 -; X32-SSE-NEXT: por %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm5, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> ) ret <16 x i8> %res Index: test/CodeGen/X86/vector-fshl-rot-128.ll =================================================================== --- test/CodeGen/X86/vector-fshl-rot-128.ll +++ test/CodeGen/X86/vector-fshl-rot-128.ll @@ -1012,28 +1012,27 @@ ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE2-NEXT: psubb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllw %xmm3, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psllw %xmm1, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: psllw %xmm3, %xmm5 +; SSE2-NEXT: psllw %xmm1, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: psrlw %xmm2, %xmm0 ; SSE2-NEXT: psrlw %xmm2, %xmm4 ; SSE2-NEXT: psrlw $8, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1043,21 +1042,21 @@ ; SSE41-NEXT: pshufb %xmm3, %xmm1 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psllw %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE41-NEXT: psllw 
%xmm4, %xmm6 ; SSE41-NEXT: pshufb %xmm3, %xmm6 -; SSE41-NEXT: pand %xmm6, %xmm2 +; SSE41-NEXT: pand %xmm5, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE41-NEXT: psubb %xmm1, %xmm3 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: psrlw %xmm1, %xmm0 -; SSE41-NEXT: psrlw %xmm1, %xmm5 -; SSE41-NEXT: pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm0, %xmm5 -; SSE41-NEXT: por %xmm5, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1197,28 +1196,27 @@ ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; X32-SSE-NEXT: psubb %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm1, %xmm3 -; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psllw %xmm3, %xmm1 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psllw %xmm1, %xmm3 ; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm4 ; X32-SSE-NEXT: pcmpeqd %xmm5, %xmm5 -; X32-SSE-NEXT: psllw %xmm3, %xmm5 +; X32-SSE-NEXT: psllw %xmm1, %xmm5 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; X32-SSE-NEXT: pand %xmm3, %xmm1 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm3, %xmm5 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X32-SSE-NEXT: psrlw %xmm2, %xmm0 ; X32-SSE-NEXT: psrlw %xmm2, %xmm4 ; X32-SSE-NEXT: psrlw $8, %xmm4 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand %xmm0, %xmm2 -; X32-SSE-NEXT: por %xmm2, %xmm1 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm0, %xmm1 +; X32-SSE-NEXT: por %xmm5, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer Index: test/CodeGen/X86/vector-fshr-rot-128.ll =================================================================== --- test/CodeGen/X86/vector-fshr-rot-128.ll +++ test/CodeGen/X86/vector-fshr-rot-128.ll @@ -1085,28 +1085,28 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: psubb %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE2-NEXT: psubb %xmm2, %xmm3 +; SSE2-NEXT: movdqa 
{{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE2-NEXT: psubb %xmm2, %xmm1 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psllw %xmm2, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE2-NEXT: psllw %xmm2, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm3, %xmm0 -; SSE2-NEXT: psrlw %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: psrlw %xmm1, %xmm4 ; SSE2-NEXT: psrlw $8, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1119,21 +1119,21 @@ ; SSE41-NEXT: psubb %xmm1, %xmm3 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllw %xmm4, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psllw %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE41-NEXT: psllw %xmm4, %xmm6 ; SSE41-NEXT: pshufb %xmm2, %xmm6 -; SSE41-NEXT: pand %xmm6, %xmm1 +; SSE41-NEXT: pand %xmm5, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE41-NEXT: psubb %xmm3, %xmm2 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: psrlw %xmm2, %xmm0 -; SSE41-NEXT: psrlw %xmm2, %xmm5 -; SSE41-NEXT: pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm0, %xmm5 -; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: psrlw %xmm2, %xmm1 +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: por %xmm6, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -1279,28 +1279,28 @@ ; X32-SSE-NEXT: pxor %xmm2, %xmm2 ; X32-SSE-NEXT: psubb %xmm1, %xmm2 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X32-SSE-NEXT: psubb %xmm2, %xmm3 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X32-SSE-NEXT: psubb %xmm2, %xmm1 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psllw %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psllw %xmm2, %xmm3 ; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm4 ; X32-SSE-NEXT: pcmpeqd %xmm5, %xmm5 ; X32-SSE-NEXT: psllw %xmm2, %xmm5 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,2,3,4,5,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm3, %xmm0 -; X32-SSE-NEXT: psrlw %xmm3, %xmm4 +; X32-SSE-NEXT: pand %xmm3, %xmm2 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: psrlw %xmm1, %xmm4 ; X32-SSE-NEXT: psrlw $8, %xmm4 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand %xmm0, %xmm2 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm0, %xmm1 ; X32-SSE-NEXT: por %xmm2, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl Index: test/CodeGen/X86/vector-idiv-udiv-128.ll =================================================================== --- test/CodeGen/X86/vector-idiv-udiv-128.ll +++ test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -554,8 +554,7 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pslld $3, %xmm2 ; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_4i32: @@ -664,8 +663,7 @@ ; SSE2-NEXT: psllw $3, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: psubb %xmm2, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_16i8: @@ -691,8 +689,7 @@ ; SSE41-NEXT: psllw $3, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: psubb %xmm2, %xmm1 -; SSE41-NEXT: paddb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_rem7_16i8: Index: test/CodeGen/X86/vector-lzcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-lzcnt-128.ll +++ test/CodeGen/X86/vector-lzcnt-128.ll @@ -123,8 +123,7 @@ ; SSSE3-NEXT: psrlq $32, %xmm0 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: psrlq $32, %xmm1 -; SSSE3-NEXT: paddq %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv2i64: @@ -156,8 +155,7 @@ ; SSE41-NEXT: psrlq $32, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: paddq 
%xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv2i64: @@ -258,8 +256,7 @@ ; X32-SSE-NEXT: psrlq $32, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrlq $32, %xmm1 -; X32-SSE-NEXT: paddq %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: paddq %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0) @@ -376,8 +373,7 @@ ; SSSE3-NEXT: psrlq $32, %xmm0 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: psrlq $32, %xmm1 -; SSSE3-NEXT: paddq %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv2i64u: @@ -409,8 +405,7 @@ ; SSE41-NEXT: psrlq $32, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv2i64u: @@ -511,8 +506,7 @@ ; X32-SSE-NEXT: psrlq $32, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrlq $32, %xmm1 -; X32-SSE-NEXT: paddq %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: paddq %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 -1) @@ -606,56 +600,54 @@ ; ; SSSE3-LABEL: testv4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psrlw $4, %xmm3 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: paddb %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: paddb %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 -; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: psrlw $8, %xmm3 +; SSSE3-NEXT: paddw %xmm1, %xmm3 ; SSSE3-NEXT: pcmpeqw %xmm4, %xmm0 ; SSSE3-NEXT: psrld $16, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrld $16, %xmm1 -; SSSE3-NEXT: paddd %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: psrld $16, %xmm3 +; SSSE3-NEXT: paddd %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 -; 
SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqw %xmm4, %xmm0 ; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: paddd %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv4i32: @@ -719,29 +711,28 @@ ; ; X32-SSE-LABEL: testv4i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pshufb %xmm0, %xmm3 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pshufb %xmm0, %xmm2 +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $4, %xmm3 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3 ; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm2 +; X32-SSE-NEXT: pshufb %xmm3, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: paddb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1 -; X32-SSE-NEXT: pand %xmm3, %xmm1 -; X32-SSE-NEXT: paddb %xmm2, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 -; X32-SSE-NEXT: pand %xmm1, %xmm2 ; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm2, %xmm1 +; X32-SSE-NEXT: pand %xmm3, %xmm1 +; X32-SSE-NEXT: psrlw $8, %xmm3 +; X32-SSE-NEXT: paddw %xmm1, %xmm3 ; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm0 ; X32-SSE-NEXT: psrld $16, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: psrld $16, %xmm1 -; X32-SSE-NEXT: paddd %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: psrld $16, %xmm3 +; X32-SSE-NEXT: paddd %xmm3, %xmm0 ; X32-SSE-NEXT: retl %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 0) @@ -835,56 +826,54 @@ ; ; SSSE3-LABEL: testv4i32u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psrlw $4, %xmm3 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: paddb %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: paddb %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 -; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: psrlw $8, %xmm3 +; 
SSSE3-NEXT: paddw %xmm1, %xmm3 ; SSSE3-NEXT: pcmpeqw %xmm4, %xmm0 ; SSSE3-NEXT: psrld $16, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrld $16, %xmm1 -; SSSE3-NEXT: paddd %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: psrld $16, %xmm3 +; SSSE3-NEXT: paddd %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv4i32u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqw %xmm4, %xmm0 ; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: paddd %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv4i32u: @@ -948,29 +937,28 @@ ; ; X32-SSE-LABEL: testv4i32u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pshufb %xmm0, %xmm3 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pshufb %xmm0, %xmm2 +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $4, %xmm3 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3 ; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm2 +; X32-SSE-NEXT: pshufb %xmm3, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: paddb %xmm1, %xmm3 +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1 -; X32-SSE-NEXT: pand %xmm3, %xmm1 -; X32-SSE-NEXT: paddb %xmm2, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 -; X32-SSE-NEXT: pand %xmm1, %xmm2 ; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm2, %xmm1 +; X32-SSE-NEXT: pand %xmm3, %xmm1 +; X32-SSE-NEXT: psrlw $8, %xmm3 +; X32-SSE-NEXT: paddw %xmm1, %xmm3 ; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm0 ; X32-SSE-NEXT: psrld $16, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: psrld $16, %xmm1 -; X32-SSE-NEXT: paddd %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: psrld $16, %xmm3 +; X32-SSE-NEXT: paddd %xmm3, %xmm0 ; X32-SSE-NEXT: retl %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 -1) @@ 
-1052,44 +1040,42 @@ ; ; SSSE3-LABEL: testv8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psrlw $4, %xmm3 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: paddb %xmm1, %xmm3 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: psrlw $8, %xmm3 +; SSSE3-NEXT: paddw %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv8i16: @@ -1148,23 +1134,22 @@ ; ; X32-SSE-LABEL: testv8i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pshufb %xmm0, %xmm3 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pshufb %xmm0, %xmm2 +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $4, %xmm3 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3 ; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm2 -; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1 -; X32-SSE-NEXT: pand %xmm3, %xmm1 -; X32-SSE-NEXT: paddb %xmm2, %xmm1 +; X32-SSE-NEXT: pshufb %xmm3, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: paddb %xmm1, %xmm3 ; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; 
X32-SSE-NEXT: psrlw $8, %xmm3 +; X32-SSE-NEXT: paddw %xmm3, %xmm0 ; X32-SSE-NEXT: retl %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0) ret <8 x i16> %out @@ -1245,44 +1230,42 @@ ; ; SSSE3-LABEL: testv8i16u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psrlw $4, %xmm3 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: paddb %xmm1, %xmm3 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: psrlw $8, %xmm3 +; SSSE3-NEXT: paddw %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv8i16u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv8i16u: @@ -1341,23 +1324,22 @@ ; ; X32-SSE-LABEL: testv8i16u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pshufb %xmm0, %xmm3 -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlw $4, %xmm1 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: pshufb %xmm0, %xmm2 +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psrlw $4, %xmm3 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3 ; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pshufb %xmm1, %xmm2 -; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1 -; X32-SSE-NEXT: pand %xmm3, %xmm1 -; X32-SSE-NEXT: paddb %xmm2, %xmm1 +; X32-SSE-NEXT: pshufb %xmm3, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: paddb %xmm1, %xmm3 ; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm0 -; 
X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: psrlw $8, %xmm3 +; X32-SSE-NEXT: paddw %xmm3, %xmm0 ; X32-SSE-NEXT: retl %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1) ret <8 x i16> %out Index: test/CodeGen/X86/vector-mul.ll =================================================================== --- test/CodeGen/X86/vector-mul.ll +++ test/CodeGen/X86/vector-mul.ll @@ -230,16 +230,14 @@ ; X86: # %bb.0: ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psllq $4, %xmm1 -; X86-NEXT: paddq %xmm0, %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm0 +; X86-NEXT: paddq %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: mul_v2i64_17: ; X64: # %bb.0: ; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: psllq $4, %xmm1 -; X64-NEXT: paddq %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: paddq %xmm1, %xmm0 ; X64-NEXT: retq ; ; X64-AVX-LABEL: mul_v2i64_17: @@ -301,8 +299,7 @@ ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psllw $4, %xmm1 ; X86-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X86-NEXT: paddb %xmm0, %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm0 +; X86-NEXT: paddb %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: mul_v16i8_17: @@ -310,8 +307,7 @@ ; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: psllw $4, %xmm1 ; X64-NEXT: pand {{.*}}(%rip), %xmm1 -; X64-NEXT: paddb %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: paddb %xmm1, %xmm0 ; X64-NEXT: retq ; ; X64-XOP-LABEL: mul_v16i8_17: Index: test/CodeGen/X86/vector-popcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-popcnt-128.ll +++ test/CodeGen/X86/vector-popcnt-128.ll @@ -25,11 +25,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64: @@ -46,11 +45,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv2i64: @@ -475,9 +473,8 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8: @@ -494,9 +491,8 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8: Index: test/CodeGen/X86/vector-reduce-fadd-fast.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -188,8 +188,7 @@ ; 
SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_zero: @@ -230,8 +229,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32_zero: @@ -272,8 +270,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32_zero: @@ -324,8 +321,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32_zero: @@ -365,8 +361,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_undef: @@ -407,8 +402,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32_undef: @@ -449,8 +443,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32_undef: @@ -501,8 +494,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32_undef: @@ -676,8 +668,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_zero: @@ -701,8 +692,7 @@ ; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_zero: @@ -734,8 +724,7 @@ ; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_zero: @@ -813,8 +802,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_undef: @@ -838,8 +826,7 @@ ; 
SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_undef: @@ -871,8 +858,7 @@ ; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_undef: Index: test/CodeGen/X86/vector-reduce-fmul-fast.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -188,8 +188,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT: mulps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: mulps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_zero: @@ -230,8 +229,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32_zero: @@ -272,8 +270,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32_zero: @@ -324,8 +321,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32_zero: @@ -365,8 +361,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT: mulps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: mulps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_undef: @@ -407,8 +402,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32_undef: @@ -449,8 +443,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32_undef: @@ -501,8 +494,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32_undef: @@ -676,8 +668,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_zero: @@ -701,8 +692,7 @@ ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, 
%xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_zero: @@ -734,8 +724,7 @@ ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_zero: @@ -813,8 +802,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_undef: @@ -838,8 +826,7 @@ ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_undef: @@ -871,8 +858,7 @@ ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_undef: Index: test/CodeGen/X86/vector-reduce-fmul.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fmul.ll +++ test/CodeGen/X86/vector-reduce-fmul.ll @@ -339,8 +339,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT: mulss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_one: @@ -1184,8 +1183,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_one: @@ -1208,11 +1206,10 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm2 -; SSE-NEXT: mulsd %xmm1, %xmm2 +; SSE-NEXT: mulsd %xmm2, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: mulsd %xmm1, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_one: @@ -1245,17 +1242,16 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm4 -; SSE-NEXT: mulsd %xmm1, %xmm4 +; SSE-NEXT: mulsd %xmm4, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: mulsd %xmm1, %xmm4 -; SSE-NEXT: mulsd %xmm2, %xmm4 +; SSE-NEXT: mulsd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: mulsd %xmm2, %xmm4 -; SSE-NEXT: mulsd %xmm3, %xmm4 +; SSE-NEXT: mulsd %xmm2, %xmm0 +; SSE-NEXT: mulsd %xmm3, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; SSE-NEXT: mulsd %xmm3, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm0 +; SSE-NEXT: mulsd %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_one: @@ -1302,7 +1298,7 @@ ; SSE-LABEL: test_v16f64_one: ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: mulsd %xmm8, %xmm0 ; SSE-NEXT: 
mulsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] Index: test/CodeGen/X86/vector-reduce-smax-widen.ll =================================================================== --- test/CodeGen/X86/vector-reduce-smax-widen.ll +++ test/CodeGen/X86/vector-reduce-smax-widen.ll @@ -36,17 +36,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: retq @@ -119,17 +119,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm4 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -267,17 +267,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -522,17 +522,17 @@ ; SSE41-NEXT: pand %xmm11, %xmm0 ; SSE41-NEXT: por %xmm10, %xmm0 ; 
SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: pxor %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pxor %xmm9, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm10, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 Index: test/CodeGen/X86/vector-reduce-smax.ll =================================================================== --- test/CodeGen/X86/vector-reduce-smax.ll +++ test/CodeGen/X86/vector-reduce-smax.ll @@ -36,17 +36,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: retq @@ -119,17 +119,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm4 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -267,17 +267,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa 
%xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -522,17 +522,17 @@ ; SSE41-NEXT: pand %xmm11, %xmm0 ; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: pxor %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pxor %xmm9, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm10, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 Index: test/CodeGen/X86/vector-reduce-umax-widen.ll =================================================================== --- test/CodeGen/X86/vector-reduce-umax-widen.ll +++ test/CodeGen/X86/vector-reduce-umax-widen.ll @@ -36,17 +36,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: retq @@ -122,17 +122,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; 
SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm4 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -280,17 +280,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -551,17 +551,17 @@ ; SSE41-NEXT: pand %xmm11, %xmm0 ; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: pxor %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pxor %xmm9, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm10, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 Index: test/CodeGen/X86/vector-reduce-umax.ll =================================================================== --- test/CodeGen/X86/vector-reduce-umax.ll +++ test/CodeGen/X86/vector-reduce-umax.ll @@ -36,17 +36,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm3, %xmm0 +; 
SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: retq @@ -122,17 +122,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE41-NEXT: pand %xmm6, %xmm4 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -280,17 +280,17 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -551,17 +551,17 @@ ; SSE41-NEXT: pand %xmm11, %xmm0 ; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: pxor %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pxor %xmm9, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE41-NEXT: pand %xmm10, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: pand 
%xmm10, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 @@ -1796,16 +1796,16 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: por %xmm3, %xmm4 -; SSE41-NEXT: por %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: por %xmm0, %xmm3 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pextrb $0, %xmm2, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax Index: test/CodeGen/X86/vector-rotate-128.ll =================================================================== --- test/CodeGen/X86/vector-rotate-128.ll +++ test/CodeGen/X86/vector-rotate-128.ll @@ -973,28 +973,27 @@ ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE2-NEXT: psubb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllw %xmm3, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psllw %xmm1, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: psllw %xmm3, %xmm5 +; SSE2-NEXT: psllw %xmm1, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: psrlw %xmm2, %xmm0 ; SSE2-NEXT: psrlw %xmm2, %xmm4 ; SSE2-NEXT: psrlw $8, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por %xmm5, 
%xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1004,21 +1003,21 @@ ; SSE41-NEXT: pshufb %xmm3, %xmm1 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psllw %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE41-NEXT: psllw %xmm4, %xmm6 ; SSE41-NEXT: pshufb %xmm3, %xmm6 -; SSE41-NEXT: pand %xmm6, %xmm2 +; SSE41-NEXT: pand %xmm5, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE41-NEXT: psubb %xmm1, %xmm3 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: psrlw %xmm1, %xmm0 -; SSE41-NEXT: psrlw %xmm1, %xmm5 -; SSE41-NEXT: pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm0, %xmm5 -; SSE41-NEXT: por %xmm5, %xmm2 +; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1146,28 +1145,27 @@ ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; X32-SSE-NEXT: psubb %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm1, %xmm3 -; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psllw %xmm3, %xmm1 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psllw %xmm1, %xmm3 ; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm4 ; X32-SSE-NEXT: pcmpeqd %xmm5, %xmm5 -; X32-SSE-NEXT: psllw %xmm3, %xmm5 +; X32-SSE-NEXT: psllw %xmm1, %xmm5 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; X32-SSE-NEXT: pand %xmm3, %xmm1 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm3, %xmm5 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X32-SSE-NEXT: psrlw %xmm2, %xmm0 ; X32-SSE-NEXT: psrlw %xmm2, %xmm4 ; X32-SSE-NEXT: psrlw $8, %xmm4 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand %xmm0, %xmm2 -; X32-SSE-NEXT: por %xmm2, %xmm1 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pand %xmm0, %xmm1 +; X32-SSE-NEXT: por %xmm5, %xmm1 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; 
X32-SSE-NEXT: retl %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer Index: test/CodeGen/X86/vector-shift-ashr-128.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-128.ll +++ test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1095,11 +1095,11 @@ ; SSE2-NEXT: psraw $2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm0, %xmm1 ; SSE2-NEXT: psraw $1, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 ; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1166,11 +1166,11 @@ ; X32-SSE-NEXT: psraw $2, %xmm1 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] ; X32-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X32-SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; X32-SSE-NEXT: movaps %xmm2, %xmm0 -; X32-SSE-NEXT: andps %xmm1, %xmm0 +; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; X32-SSE-NEXT: movaps %xmm2, %xmm1 +; X32-SSE-NEXT: andps %xmm0, %xmm1 ; X32-SSE-NEXT: psraw $1, %xmm2 -; X32-SSE-NEXT: andnps %xmm2, %xmm1 +; X32-SSE-NEXT: andnps %xmm2, %xmm0 ; X32-SSE-NEXT: orps %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <8 x i16> %a, Index: test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll +++ test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll @@ -1796,11 +1796,11 @@ ; SSE2-NEXT: psraw $2, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: andps %xmm0, %xmm2 ; SSE2-NEXT: psraw $1, %xmm1 -; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: andnps %xmm1, %xmm0 ; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -1887,11 +1887,11 @@ ; X32-SSE-NEXT: psraw $2, %xmm1 ; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] ; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; X32-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] -; X32-SSE-NEXT: movaps %xmm1, %xmm0 -; X32-SSE-NEXT: andps %xmm2, %xmm0 +; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,65535,65535,65535] +; X32-SSE-NEXT: movaps %xmm1, %xmm2 +; X32-SSE-NEXT: andps %xmm0, %xmm2 ; X32-SSE-NEXT: psraw $1, %xmm1 -; X32-SSE-NEXT: andnps %xmm1, %xmm2 +; X32-SSE-NEXT: andnps %xmm1, %xmm0 ; X32-SSE-NEXT: orps %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <4 x i16> %a, Index: test/CodeGen/X86/vector-shift-ashr-sub128.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -2205,11 +2205,11 @@ ; SSE2-NEXT: psraw $2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; 
SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: andps %xmm0, %xmm2 ; SSE2-NEXT: psraw $1, %xmm1 -; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: andnps %xmm1, %xmm0 ; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -2220,9 +2220,9 @@ ; SSE41-NEXT: psraw $8, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = ; SSE41-NEXT: pmulhw %xmm1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; SSE41-NEXT: psraw $9, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX-LABEL: constant_shift_v8i8: @@ -2292,11 +2292,11 @@ ; X32-SSE-NEXT: psraw $2, %xmm0 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X32-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; X32-SSE-NEXT: movaps %xmm1, %xmm0 -; X32-SSE-NEXT: andps %xmm2, %xmm0 +; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; X32-SSE-NEXT: movaps %xmm1, %xmm2 +; X32-SSE-NEXT: andps %xmm0, %xmm2 ; X32-SSE-NEXT: psraw $1, %xmm1 -; X32-SSE-NEXT: andnps %xmm1, %xmm2 +; X32-SSE-NEXT: andnps %xmm1, %xmm0 ; X32-SSE-NEXT: orps %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <8 x i8> %a, Index: test/CodeGen/X86/vector-shift-lshr-sub128.ll =================================================================== --- test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -1902,15 +1902,15 @@ ; SSE41-LABEL: constant_shift_v4i8: ; SSE41: # %bb.0: ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $3, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrld $1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrld $2, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psrld $3, %xmm0 -; SSE41-NEXT: psrld $1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v4i8: Index: test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining.ll +++ test/CodeGen/X86/vector-shuffle-combining.ll @@ -2730,8 +2730,7 @@ ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE2-NEXT: addps %xmm0, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: addps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: PR22390: @@ -2739,8 +2738,7 @@ ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] ; SSSE3-NEXT: movaps %xmm0, %xmm2 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSSE3-NEXT: addps %xmm0, %xmm2 -; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: addps %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: PR22390: Index: test/CodeGen/X86/vector-tzcnt-128.ll 
=================================================================== --- test/CodeGen/X86/vector-tzcnt-128.ll +++ test/CodeGen/X86/vector-tzcnt-128.ll @@ -33,11 +33,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64: @@ -57,11 +56,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv2i64: @@ -236,11 +234,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64u: @@ -260,11 +257,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv2i64u: @@ -1277,9 +1273,8 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8: @@ -1299,9 +1294,8 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8: @@ -1434,9 +1428,8 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8u: @@ -1456,9 +1449,8 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8u: Index: test/CodeGen/X86/vselect-minmax.ll =================================================================== --- test/CodeGen/X86/vselect-minmax.ll +++ 
test/CodeGen/X86/vselect-minmax.ll @@ -63,8 +63,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test3: @@ -89,8 +88,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test4: @@ -393,8 +391,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test19: @@ -419,8 +416,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test20: @@ -647,14 +643,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test27: @@ -694,14 +688,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test28: @@ -1255,14 +1247,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test43: @@ -1302,14 +1292,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test44: @@ -1447,23 +1435,22 @@ define <8 x i32> @test47(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test47: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, 
%xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test47: @@ -1499,23 +1486,22 @@ define <8 x i32> @test48(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test48: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test48: @@ -1555,8 +1541,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test49: @@ -1581,8 +1566,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test50: @@ -1885,8 +1869,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test65: @@ -1911,8 +1894,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test66: @@ -2099,14 +2081,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; 
SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test73: @@ -2146,14 +2126,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test74: @@ -2707,14 +2685,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test89: @@ -2754,14 +2730,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test90: @@ -2887,23 +2861,22 @@ define <8 x i32> @test93(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test93: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test93: @@ -2939,23 +2912,22 @@ define <8 x i32> @test94(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test94: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; 
SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test94: @@ -3225,26 +3197,22 @@ ; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test99: @@ -3292,26 +3260,22 @@ ; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test100: @@ -4037,26 +4001,22 @@ ; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; 
SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test115: @@ -4104,26 +4064,22 @@ ; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test116: @@ -4317,40 +4273,38 @@ define <16 x i32> @test119(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: test119: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm8 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm9 -; SSE2-NEXT: por 
%xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test119: @@ -4394,40 +4348,38 @@ define <16 x i32> @test120(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: test120: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm8 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm9 -; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test120: @@ -5555,26 +5507,22 @@ ; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; 
SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: test129:
@@ -5622,26 +5570,22 @@
 ; SSE2-NEXT: pcmpgtb %xmm4, %xmm8
 ; SSE2-NEXT: pand %xmm8, %xmm0
 ; SSE2-NEXT: pandn %xmm4, %xmm8
-; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm0
 ; SSE2-NEXT: movdqa %xmm1, %xmm4
 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm4
 ; SSE2-NEXT: pand %xmm4, %xmm1
 ; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: test130:
@@ -6367,26 +6311,22 @@
 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
 ; SSE2-NEXT: pand %xmm8, %xmm0
 ; SSE2-NEXT: pandn %xmm4, %xmm8
-; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm0
 ; SSE2-NEXT: movdqa %xmm1, %xmm4
 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
 ; SSE2-NEXT: pand %xmm4, %xmm1
 ; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: test145:
@@ -6434,26 +6374,22 @@
 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
 ; SSE2-NEXT: pand %xmm8, %xmm0
 ; SSE2-NEXT: pandn %xmm4, %xmm8
-; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm0
 ; SSE2-NEXT: movdqa %xmm1, %xmm4
 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
 ; SSE2-NEXT: pand %xmm4, %xmm1
 ; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: test146:
@@ -6623,40 +6559,38 @@
 define <16 x i32> @test149(<16 x i32> %a, <16 x i32> %b) {
 ; SSE2-LABEL: test149:
 ; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
 ; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pandn %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm8, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm10
 ; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm8
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm9
-; SSE2-NEXT: por %xmm9, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: test149:
@@ -6700,40 +6634,38 @@
 define <16 x i32> @test150(<16 x i32> %a, <16 x i32> %b) {
 ; SSE2-LABEL: test150:
 ; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
 ; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pandn %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm8, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm10
 ; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm8
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm9
-; SSE2-NEXT: por %xmm9, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: test150:
Index: test/CodeGen/X86/vselect-zero.ll
===================================================================
--- test/CodeGen/X86/vselect-zero.ll
+++ test/CodeGen/X86/vselect-zero.ll
@@ -142,12 +142,11 @@
 ; SSE2-LABEL: vsel_nonzero_constants:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: cmplepd %xmm0, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: movapd %xmm1, %xmm2
-; SSE2-NEXT: andnpd %xmm0, %xmm2
-; SSE2-NEXT: andpd {{.*}}(%rip), %xmm1
-; SSE2-NEXT: orpd %xmm2, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: andnpd %xmm2, %xmm0
+; SSE2-NEXT: andpd {{.*}}(%rip), %xmm1
+; SSE2-NEXT: orpd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE42-LABEL: vsel_nonzero_constants:
Index: test/CodeGen/X86/x86-shifts.ll
===================================================================
--- test/CodeGen/X86/x86-shifts.ll
+++ test/CodeGen/X86/x86-shifts.ll
@@ -219,26 +219,24 @@
 define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
 ; X32-LABEL: shr2_nosplat:
 ; X32: # %bb.0: # %entry
-; X32-NEXT: movdqa %xmm0, %xmm2
-; X32-NEXT: psrlq $8, %xmm2
 ; X32-NEXT: movdqa %xmm0, %xmm1
-; X32-NEXT: psrlq $1, %xmm1
-; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; X32-NEXT: xorpd %xmm0, %xmm1
-; X32-NEXT: movapd %xmm1, %xmm0
+; X32-NEXT: psrlq $8, %xmm1
+; X32-NEXT: movdqa %xmm0, %xmm2
+; X32-NEXT: psrlq $1, %xmm2
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; X32-NEXT: xorpd %xmm2, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: shr2_nosplat:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: psrlq $8, %xmm2
 ; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: psrlq $1, %xmm1
-; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X64-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; X64-NEXT: xorpd %xmm0, %xmm1
-; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: psrlq $8, %xmm1
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: psrlq $1, %xmm2
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X64-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; X64-NEXT: xorpd %xmm2, %xmm0
 ; X64-NEXT: retq
 entry:
 %B = lshr <2 x i64> %A, < i64 8, i64 1>