diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -23,6 +23,7 @@ let MicroOpBufferSize = 168; // Based on the reorder buffer. let LoadLatency = 5; let MispredictPenalty = 16; + let PostRAScheduler = 1; // Based on the LSD (loop-stream detector) queue size. let LoopMicroOpBufferSize = 28; diff --git a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll --- a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll +++ b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll @@ -9,15 +9,15 @@ ; CHECK-NEXT: subl $16, %esp ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: .cfi_offset %edi, -8 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, (%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: movq (%esp), %mm0 ; CHECK-NEXT: movq {{[0-9]+}}(%esp), %mm1 ; CHECK-NEXT: maskmovq %mm0, %mm1 diff --git a/llvm/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll b/llvm/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll --- a/llvm/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll +++ b/llvm/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll @@ -11,11 +11,11 @@ ; SOURCE-SCHED: # %bb.0: # %entry ; SOURCE-SCHED-NEXT: subl $12, %esp ; SOURCE-SCHED-NEXT: movl g_5, %eax -; SOURCE-SCHED-NEXT: sarl %eax ; SOURCE-SCHED-NEXT: xorl %ecx, %ecx +; SOURCE-SCHED-NEXT: movb g_73, %dl +; SOURCE-SCHED-NEXT: sarl %eax ; SOURCE-SCHED-NEXT: cmpl $1, %eax ; SOURCE-SCHED-NEXT: setg %cl -; SOURCE-SCHED-NEXT: movb g_73, %dl ; SOURCE-SCHED-NEXT: xorl %eax, %eax ; SOURCE-SCHED-NEXT: subb {{[0-9]+}}(%esp), %al ; SOURCE-SCHED-NEXT: testb %dl, %dl diff --git a/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll b/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll --- a/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll +++ b/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll @@ -22,207 +22,207 @@ ; CHECK-NEXT: movq {{.*}}(%rip), %r14 ; CHECK-NEXT: movq {{.*}}(%rip), %r11 ; CHECK-NEXT: movq {{.*}}(%rip), %rdx -; CHECK-NEXT: addq %r15, %rdx ; CHECK-NEXT: movq {{.*}}(%rip), %rsi -; CHECK-NEXT: bswapq %rsi +; CHECK-NEXT: movq {{.*}}(%rip), %rdi ; CHECK-NEXT: leaq (%r11,%r14), %rbx +; CHECK-NEXT: addq %r15, %rdx +; CHECK-NEXT: bswapq %rsi +; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: addq %r15, %rbx ; CHECK-NEXT: addq %rdx, %rbx +; CHECK-NEXT: addq %r11, %rdi +; CHECK-NEXT: movq {{.*}}(%rip), %r11 +; CHECK-NEXT: movq {{.*}}(%rip), %rcx ; CHECK-NEXT: addq %rsi, %rbx ; CHECK-NEXT: leaq (%r9,%r10), %rsi ; CHECK-NEXT: leaq (%rsi,%r8), %rdx -; CHECK-NEXT: addq %rsi, %rdx -; CHECK-NEXT: movq {{.*}}(%rip), %rdi ; CHECK-NEXT: addq %rbx, %r12 -; CHECK-NEXT: addq %r8, %rdx -; CHECK-NEXT: bswapq %rdi -; CHECK-NEXT: addq %rbx, %rdx +; CHECK-NEXT: addq %rsi, %rdx ; CHECK-NEXT: leaq (%r15,%r14), %rsi +; CHECK-NEXT: addq %r8, %rdx ; CHECK-NEXT: addq %r12, %rsi -; CHECK-NEXT: addq %r11, %rdi -; CHECK-NEXT: addq %rsi, %rdi +; CHECK-NEXT: addq %rbx, %rdx ; CHECK-NEXT: leaq (%r10,%r8), %rbx +; CHECK-NEXT: addq %rsi, %rdi +; CHECK-NEXT: 
bswapq %r11 +; CHECK-NEXT: bswapq %rcx ; CHECK-NEXT: leaq (%rdx,%rbx), %rsi -; CHECK-NEXT: addq %rbx, %rsi -; CHECK-NEXT: movq {{.*}}(%rip), %rbx ; CHECK-NEXT: addq %r12, %rdi +; CHECK-NEXT: leaq (%rdx,%r8), %rax +; CHECK-NEXT: addq %rbx, %rsi ; CHECK-NEXT: addq %rdi, %r9 +; CHECK-NEXT: addq %r14, %r11 +; CHECK-NEXT: addq %r15, %rcx ; CHECK-NEXT: addq %rdx, %rsi ; CHECK-NEXT: addq %rdi, %rsi -; CHECK-NEXT: bswapq %rbx ; CHECK-NEXT: leaq (%r12,%r15), %rdi ; CHECK-NEXT: addq %r9, %rdi -; CHECK-NEXT: addq %r14, %rbx -; CHECK-NEXT: addq %rdi, %rbx -; CHECK-NEXT: leaq (%rdx,%r8), %rax +; CHECK-NEXT: leaq (%rsi,%rdx), %rbx +; CHECK-NEXT: addq %rdi, %r11 ; CHECK-NEXT: leaq (%rsi,%rax), %rdi ; CHECK-NEXT: addq %rax, %rdi -; CHECK-NEXT: movq {{.*}}(%rip), %rcx -; CHECK-NEXT: addq %r9, %rbx -; CHECK-NEXT: addq %rbx, %r10 -; CHECK-NEXT: addq %rsi, %rdi -; CHECK-NEXT: bswapq %rcx -; CHECK-NEXT: addq %rbx, %rdi +; CHECK-NEXT: addq %r9, %r11 ; CHECK-NEXT: leaq (%r9,%r12), %rax +; CHECK-NEXT: addq %rsi, %rdi +; CHECK-NEXT: addq %r11, %r10 +; CHECK-NEXT: addq %r11, %rdi ; CHECK-NEXT: addq %r10, %rax -; CHECK-NEXT: addq %r15, %rcx -; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: leaq (%rsi,%rdx), %rbx ; CHECK-NEXT: leaq (%rdi,%rbx), %r11 +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: leaq (%rdi,%rsi), %rax ; CHECK-NEXT: addq %rbx, %r11 ; CHECK-NEXT: movq {{.*}}(%rip), %rbx ; CHECK-NEXT: addq %r10, %rcx -; CHECK-NEXT: addq %rcx, %r8 ; CHECK-NEXT: addq %rdi, %r11 +; CHECK-NEXT: addq %rcx, %r8 ; CHECK-NEXT: addq %rcx, %r11 -; CHECK-NEXT: bswapq %rbx ; CHECK-NEXT: leaq (%r10,%r9), %rcx -; CHECK-NEXT: addq %r8, %rcx -; CHECK-NEXT: addq %r12, %rbx -; CHECK-NEXT: addq %rcx, %rbx -; CHECK-NEXT: leaq (%rdi,%rsi), %rax ; CHECK-NEXT: leaq (%r11,%rax), %r14 +; CHECK-NEXT: addq %r8, %rcx ; CHECK-NEXT: addq %rax, %r14 ; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: addq %r8, %rbx -; CHECK-NEXT: addq %rbx, %rdx +; CHECK-NEXT: movq {{.*}}(%rip), %r15 +; CHECK-NEXT: bswapq %rbx ; CHECK-NEXT: addq %r11, %r14 +; CHECK-NEXT: addq %r12, %rbx +; CHECK-NEXT: addq %rcx, %rbx +; CHECK-NEXT: addq %r8, %rbx ; CHECK-NEXT: bswapq %rax +; CHECK-NEXT: bswapq %r15 +; CHECK-NEXT: addq %rbx, %rdx ; CHECK-NEXT: addq %rbx, %r14 ; CHECK-NEXT: leaq (%r8,%r10), %rbx ; CHECK-NEXT: addq %rdx, %rbx ; CHECK-NEXT: addq %r9, %rax +; CHECK-NEXT: addq %r10, %r15 ; CHECK-NEXT: addq %rbx, %rax ; CHECK-NEXT: leaq (%r11,%rdi), %rbx ; CHECK-NEXT: leaq (%r14,%rbx), %r9 -; CHECK-NEXT: addq %rbx, %r9 -; CHECK-NEXT: movq {{.*}}(%rip), %rbx ; CHECK-NEXT: addq %rdx, %rax +; CHECK-NEXT: addq %rbx, %r9 ; CHECK-NEXT: addq %rax, %rsi ; CHECK-NEXT: addq %r14, %r9 +; CHECK-NEXT: leaq (%rsi,%rdx), %rbx ; CHECK-NEXT: addq %rax, %r9 -; CHECK-NEXT: bswapq %rbx ; CHECK-NEXT: leaq (%rdx,%r8), %rax ; CHECK-NEXT: addq %rsi, %rax -; CHECK-NEXT: addq %r10, %rbx -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r15 ; CHECK-NEXT: leaq (%r14,%r11), %rax ; CHECK-NEXT: leaq (%r9,%rax), %r10 +; CHECK-NEXT: addq %rsi, %r15 ; CHECK-NEXT: addq %rax, %r10 ; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: addq %rsi, %rbx -; CHECK-NEXT: addq %rbx, %rdi +; CHECK-NEXT: movq {{.*}}(%rip), %rcx +; CHECK-NEXT: addq %r15, %rdi ; CHECK-NEXT: addq %r9, %r10 -; CHECK-NEXT: bswapq %rax -; CHECK-NEXT: addq %rbx, %r10 -; CHECK-NEXT: leaq (%rsi,%rdx), %rbx ; CHECK-NEXT: addq %rdi, %rbx +; CHECK-NEXT: addq %r15, %r10 +; CHECK-NEXT: bswapq %rax +; CHECK-NEXT: bswapq %rcx ; CHECK-NEXT: addq %r8, %rax +; CHECK-NEXT: addq %rdx, %rcx ; CHECK-NEXT: addq %rbx, %rax ; CHECK-NEXT: leaq (%r9,%r14), 
%rbx ; CHECK-NEXT: leaq (%r10,%rbx), %r8 -; CHECK-NEXT: addq %rbx, %r8 -; CHECK-NEXT: movq {{.*}}(%rip), %rbx ; CHECK-NEXT: addq %rdi, %rax +; CHECK-NEXT: addq %rbx, %r8 ; CHECK-NEXT: addq %rax, %r11 ; CHECK-NEXT: addq %r10, %r8 +; CHECK-NEXT: leaq (%r11,%rdi), %rbx ; CHECK-NEXT: addq %rax, %r8 -; CHECK-NEXT: bswapq %rbx ; CHECK-NEXT: leaq (%rdi,%rsi), %rax ; CHECK-NEXT: addq %r11, %rax -; CHECK-NEXT: addq %rdx, %rbx -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: leaq (%r10,%r9), %rax ; CHECK-NEXT: leaq (%r8,%rax), %r15 +; CHECK-NEXT: addq %r11, %rcx ; CHECK-NEXT: addq %rax, %r15 ; CHECK-NEXT: movq {{.*}}(%rip), %rax -; CHECK-NEXT: addq %r11, %rbx -; CHECK-NEXT: addq %rbx, %r14 +; CHECK-NEXT: movq {{.*}}(%rip), %rdx +; CHECK-NEXT: addq %rcx, %r14 ; CHECK-NEXT: addq %r8, %r15 -; CHECK-NEXT: bswapq %rax -; CHECK-NEXT: addq %rbx, %r15 -; CHECK-NEXT: leaq (%r11,%rdi), %rbx ; CHECK-NEXT: addq %r14, %rbx +; CHECK-NEXT: addq %rcx, %r15 +; CHECK-NEXT: movq {{.*}}(%rip), %rcx +; CHECK-NEXT: bswapq %rax +; CHECK-NEXT: bswapq %rdx ; CHECK-NEXT: addq %rsi, %rax +; CHECK-NEXT: addq %rdi, %rdx +; CHECK-NEXT: bswapq %rcx ; CHECK-NEXT: addq %rbx, %rax ; CHECK-NEXT: leaq (%r8,%r10), %rbx ; CHECK-NEXT: leaq (%r15,%rbx), %rsi -; CHECK-NEXT: addq %rbx, %rsi -; CHECK-NEXT: movq {{.*}}(%rip), %rbx ; CHECK-NEXT: addq %r14, %rax +; CHECK-NEXT: addq %r11, %rcx +; CHECK-NEXT: addq %rbx, %rsi ; CHECK-NEXT: addq %rax, %r9 ; CHECK-NEXT: addq %r15, %rsi ; CHECK-NEXT: addq %rax, %rsi -; CHECK-NEXT: bswapq %rbx ; CHECK-NEXT: leaq (%r14,%r11), %rax ; CHECK-NEXT: addq %r9, %rax -; CHECK-NEXT: addq %rdi, %rbx -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: leaq (%rsi,%r15), %rbx +; CHECK-NEXT: addq %rax, %rdx ; CHECK-NEXT: leaq (%r15,%r8), %rax ; CHECK-NEXT: leaq (%rsi,%rax), %r12 +; CHECK-NEXT: addq %r9, %rdx ; CHECK-NEXT: addq %rax, %r12 -; CHECK-NEXT: movq {{.*}}(%rip), %rcx -; CHECK-NEXT: addq %r9, %rbx -; CHECK-NEXT: addq %rbx, %r10 -; CHECK-NEXT: addq %rsi, %r12 -; CHECK-NEXT: bswapq %rcx -; CHECK-NEXT: addq %rbx, %r12 +; CHECK-NEXT: addq %rdx, %r10 ; CHECK-NEXT: leaq (%r9,%r14), %rax +; CHECK-NEXT: addq %rsi, %r12 ; CHECK-NEXT: addq %r10, %rax -; CHECK-NEXT: addq %r11, %rcx +; CHECK-NEXT: addq %rdx, %r12 ; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: leaq (%rsi,%r15), %rbx ; CHECK-NEXT: leaq (%r12,%rbx), %rax +; CHECK-NEXT: addq %r10, %rcx +; CHECK-NEXT: leaq (%r12,%rsi), %rdx ; CHECK-NEXT: addq %rbx, %rax ; CHECK-NEXT: movq {{.*}}(%rip), %rbx -; CHECK-NEXT: addq %r10, %rcx +; CHECK-NEXT: movq {{.*}}(%rip), %rdi ; CHECK-NEXT: addq %rcx, %r8 ; CHECK-NEXT: addq %r12, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: bswapq %rbx ; CHECK-NEXT: leaq (%r10,%r9), %rcx ; CHECK-NEXT: addq %r8, %rcx +; CHECK-NEXT: bswapq %rbx +; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: addq %r14, %rbx +; CHECK-NEXT: addq %r9, %rdi +; CHECK-NEXT: leaq (%rax,%r12), %r9 ; CHECK-NEXT: addq %rcx, %rbx -; CHECK-NEXT: leaq (%r12,%rsi), %rdx ; CHECK-NEXT: leaq (%rax,%rdx), %rcx ; CHECK-NEXT: addq %rdx, %rcx -; CHECK-NEXT: movq {{.*}}(%rip), %rdx ; CHECK-NEXT: addq %r8, %rbx -; CHECK-NEXT: addq %rbx, %r15 ; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: bswapq %rdx +; CHECK-NEXT: addq %rbx, %r15 ; CHECK-NEXT: addq %rbx, %rcx ; CHECK-NEXT: leaq (%r8,%r10), %rbx +; CHECK-NEXT: leaq (%rcx,%r9), %r11 ; CHECK-NEXT: addq %r15, %rbx -; CHECK-NEXT: addq %r9, %rdx -; CHECK-NEXT: addq %rbx, %rdx -; CHECK-NEXT: leaq (%rax,%r12), %r9 -; CHECK-NEXT: leaq (%rcx,%r9), %rbx -; CHECK-NEXT: addq %r9, %rbx -; CHECK-NEXT: addq %r15, %rdx -; CHECK-NEXT: 
addq %rdx, %rsi -; CHECK-NEXT: addq %rcx, %rbx -; CHECK-NEXT: addq %rdx, %rbx -; CHECK-NEXT: movq {{.*}}(%rip), %rdx -; CHECK-NEXT: bswapq %rdx -; CHECK-NEXT: addq %r10, %rdx -; CHECK-NEXT: leaq (%r15,%r8), %rdi -; CHECK-NEXT: addq %rsi, %rdi -; CHECK-NEXT: addq %rdi, %rdx -; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: leaq (%rbx,%rcx), %rdi -; CHECK-NEXT: addq %rcx, %rdi +; CHECK-NEXT: addq %r9, %r11 +; CHECK-NEXT: movq {{.*}}(%rip), %r9 ; CHECK-NEXT: addq %rbx, %rdi -; CHECK-NEXT: addq %rsi, %rdx -; CHECK-NEXT: addq %rdx, %r12 -; CHECK-NEXT: addq %rdx, %rdi -; CHECK-NEXT: addq %r15, %rsi +; CHECK-NEXT: addq %rcx, %r11 +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: addq %r15, %rdi +; CHECK-NEXT: addq %rdi, %rsi +; CHECK-NEXT: addq %rdi, %r11 +; CHECK-NEXT: leaq (%r15,%r8), %rdi +; CHECK-NEXT: addq %rsi, %rdi +; CHECK-NEXT: bswapq %r9 ; CHECK-NEXT: bswapq %rax +; CHECK-NEXT: addq %r10, %r9 +; CHECK-NEXT: addq %rdi, %r9 +; CHECK-NEXT: leaq (%r11,%rcx), %rdi ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: addq %r8, %rax +; CHECK-NEXT: addq %rsi, %r9 +; CHECK-NEXT: addq %r15, %rsi +; CHECK-NEXT: addq %rcx, %rdi +; CHECK-NEXT: addq %r9, %r12 +; CHECK-NEXT: addq %r11, %rdi ; CHECK-NEXT: addq %r12, %rsi +; CHECK-NEXT: addq %r9, %rdi ; CHECK-NEXT: addq %rsi, %rax ; CHECK-NEXT: addq %r12, %rax ; CHECK-NEXT: addq %rdi, %rax diff --git a/llvm/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll b/llvm/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll --- a/llvm/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll +++ b/llvm/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll @@ -11,8 +11,8 @@ ; CHECK: movl 4([[REG]]), %edx ; CHECK: LBB0_1: ; CHECK: movl %eax, %ebx -; CHECK: addl $1, %ebx ; CHECK: movl %edx, %ecx +; CHECK: addl $1, %ebx ; CHECK: adcl $0, %ecx ; CHECK: lock cmpxchg8b ([[REG]]) ; CHECK-NEXT: jne diff --git a/llvm/test/CodeGen/X86/2011-05-09-loaduse.ll b/llvm/test/CodeGen/X86/2011-05-09-loaduse.ll --- a/llvm/test/CodeGen/X86/2011-05-09-loaduse.ll +++ b/llvm/test/CodeGen/X86/2011-05-09-loaduse.ll @@ -7,10 +7,10 @@ ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movaps (%eax), %xmm0 -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; X86-NEXT: xorps %xmm1, %xmm1 +; X86-NEXT: movaps (%eax), %xmm0 ; X86-NEXT: movaps %xmm1, (%eax) +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax @@ -19,9 +19,9 @@ ; X64-LABEL: test: ; X64: # %bb.0: # %entry ; X64-NEXT: movaps (%rdi), %xmm0 -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: movaps %xmm1, (%rdi) +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; X64-NEXT: retq entry: %T = load <4 x float>, <4 x float>* %A diff --git a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll --- a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll +++ b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll @@ -20,9 +20,9 @@ ; CHECK-NEXT: movq {{.*}}(%rip), %rsi ; CHECK-NEXT: movq {{.*}}(%rip), %rax ; CHECK-NEXT: movq %rsi, %rdx -; CHECK-NEXT: shrq $8, %rdx ; CHECK-NEXT: movsbl %al, %ecx ; CHECK-NEXT: shrq $8, %rax +; CHECK-NEXT: shrq $8, %rdx ; CHECK-NEXT: cbtw ; CHECK-NEXT: idivb %dl ; CHECK-NEXT: movl %eax, %edx @@ -31,9 +31,9 @@ ; CHECK-NEXT: movzbl %dl, %ecx ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: pinsrb $1, %ecx, %xmm0 ; CHECK-NEXT: pextrw $0, %xmm0, {{.*}}(%rip) -; CHECK-NEXT: 
xorl %eax, %eax ; CHECK-NEXT: retq entry: %0 = load <2 x i8>, <2 x i8>* @i, align 8 diff --git a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll --- a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -24,8 +24,8 @@ define void @complex_inreg_work(<2 x float> %a, <2 x float> %b, <2 x float> %c) { ; X32-LABEL: complex_inreg_work: ; X32: # %bb.0: # %entry -; X32-NEXT: movaps %xmm0, %xmm3 ; X32-NEXT: cmpordps %xmm2, %xmm2 +; X32-NEXT: movaps %xmm0, %xmm3 ; X32-NEXT: movaps %xmm2, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm3, %xmm1 ; X32-NEXT: movlps %xmm1, (%eax) @@ -33,8 +33,8 @@ ; ; X64-LABEL: complex_inreg_work: ; X64: # %bb.0: # %entry -; X64-NEXT: movaps %xmm0, %xmm3 ; X64-NEXT: cmpordps %xmm2, %xmm2 +; X64-NEXT: movaps %xmm0, %xmm3 ; X64-NEXT: movaps %xmm2, %xmm0 ; X64-NEXT: blendvps %xmm0, %xmm3, %xmm1 ; X64-NEXT: movlps %xmm1, (%rax) @@ -70,15 +70,15 @@ ; X32-NEXT: subl $60, %esp ; X32-NEXT: .cfi_def_cfa_offset 64 ; X32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X32-NEXT: movaps {{.*#+}} xmm3 = <1.0E+0,1.0E+0,u,u> ; X32-NEXT: cvttps2dq %xmm2, %xmm0 ; X32-NEXT: cvtdq2ps %xmm0, %xmm1 ; X32-NEXT: xorps %xmm0, %xmm0 ; X32-NEXT: cmpltps %xmm2, %xmm0 -; X32-NEXT: movaps {{.*#+}} xmm3 = <1.0E+0,1.0E+0,u,u> ; X32-NEXT: addps %xmm1, %xmm3 ; X32-NEXT: movaps %xmm1, %xmm4 -; X32-NEXT: blendvps %xmm0, %xmm3, %xmm4 ; X32-NEXT: cmpeqps %xmm2, %xmm1 +; X32-NEXT: blendvps %xmm0, %xmm3, %xmm4 ; X32-NEXT: movaps %xmm1, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm4 ; X32-NEXT: movlps %xmm4, {{[0-9]+}}(%esp) @@ -90,15 +90,15 @@ ; X64-LABEL: full_test: ; X64: # %bb.0: # %entry ; X64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X64-NEXT: movaps {{.*#+}} xmm3 = <1.0E+0,1.0E+0,u,u> ; X64-NEXT: cvttps2dq %xmm2, %xmm0 ; X64-NEXT: cvtdq2ps %xmm0, %xmm1 ; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: cmpltps %xmm2, %xmm0 -; X64-NEXT: movaps {{.*#+}} xmm3 = <1.0E+0,1.0E+0,u,u> ; X64-NEXT: addps %xmm1, %xmm3 ; X64-NEXT: movaps %xmm1, %xmm4 -; X64-NEXT: blendvps %xmm0, %xmm3, %xmm4 ; X64-NEXT: cmpeqps %xmm2, %xmm1 +; X64-NEXT: blendvps %xmm0, %xmm3, %xmm4 ; X64-NEXT: movaps %xmm1, %xmm0 ; X64-NEXT: blendvps %xmm0, %xmm2, %xmm4 ; X64-NEXT: movlps %xmm4, -{{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/2011-10-21-widen-cmp.ll b/llvm/test/CodeGen/X86/2011-10-21-widen-cmp.ll --- a/llvm/test/CodeGen/X86/2011-10-21-widen-cmp.ll +++ b/llvm/test/CodeGen/X86/2011-10-21-widen-cmp.ll @@ -7,8 +7,8 @@ define void @cmp_2_floats(<2 x float> %a, <2 x float> %b, <2 x float> %c) { ; CHECK-LABEL: cmp_2_floats: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: cmpordps %xmm2, %xmm2 +; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: blendvps %xmm0, %xmm3, %xmm1 ; CHECK-NEXT: movlps %xmm1, (%rax) @@ -23,8 +23,8 @@ define void @cmp_2_doubles(<2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: cmp_2_doubles: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movapd %xmm0, %xmm3 ; CHECK-NEXT: cmpordpd %xmm2, %xmm2 +; CHECK-NEXT: movapd %xmm0, %xmm3 ; CHECK-NEXT: movapd %xmm2, %xmm0 ; CHECK-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; CHECK-NEXT: movapd %xmm1, (%rax) diff --git a/llvm/test/CodeGen/X86/2011-12-28-vselecti8.ll b/llvm/test/CodeGen/X86/2011-12-28-vselecti8.ll --- a/llvm/test/CodeGen/X86/2011-12-28-vselecti8.ll +++ b/llvm/test/CodeGen/X86/2011-12-28-vselecti8.ll @@ -19,9 +19,9 @@ ; CHECK-LABEL: foo8: ; CHECK: ## %bb.0: ## %allocas ; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; CHECK-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.0E+2,2.0E+0,1.0E+2,4.0E+0] ; CHECK-NEXT: movups %xmm1, (%rdi) +; CHECK-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: movups %xmm0, 16(%rdi) ; CHECK-NEXT: retq allocas: diff --git a/llvm/test/CodeGen/X86/2012-11-28-merge-store-alias.ll b/llvm/test/CodeGen/X86/2012-11-28-merge-store-alias.ll --- a/llvm/test/CodeGen/X86/2012-11-28-merge-store-alias.ll +++ b/llvm/test/CodeGen/X86/2012-11-28-merge-store-alias.ll @@ -4,7 +4,7 @@ ; CHECK: callq foo ; CHECK: xorps %xmm0, %xmm0 ; CHECK-NEXT: movups %xmm0 -; CHECK-NEXT: movl 36(%rsp), %ebp +; CHECK: movl 36(%rsp), %ebp ; CHECK: callq foo ; CHECK: ret declare i32 @foo([10 x i32]* ) diff --git a/llvm/test/CodeGen/X86/GlobalISel/add-vec.ll b/llvm/test/CodeGen/X86/GlobalISel/add-vec.ll --- a/llvm/test/CodeGen/X86/GlobalISel/add-vec.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/add-vec.ll @@ -55,8 +55,8 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq %ret = add <32 x i8> %arg1, %arg2 @@ -78,8 +78,8 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq %ret = add <16 x i16> %arg1, %arg2 @@ -101,8 +101,8 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq %ret = add <8 x i32> %arg1, %arg2 @@ -124,8 +124,8 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq %ret = add <4 x i64> %arg1, %arg2 @@ -149,11 +149,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpaddb %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 -; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpaddb %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: retq @@ -178,11 +178,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpaddw %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 -; AVX1-NEXT: vpaddw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 ; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpaddw %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 
; AVX1-NEXT: retq @@ -207,11 +207,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 -; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: retq @@ -236,11 +236,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 -; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpaddq %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/atom-sched.ll b/llvm/test/CodeGen/X86/atom-sched.ll --- a/llvm/test/CodeGen/X86/atom-sched.ll +++ b/llvm/test/CodeGen/X86/atom-sched.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O2 -mcpu=atom -mtriple=i686-- -relocation-model=static | FileCheck -check-prefix=atom %s ; RUN: llc < %s -O2 -mcpu=slm -mtriple=i686-- -relocation-model=static | FileCheck -check-prefix=slm %s ; RUN: llc < %s -O2 -mcpu=goldmont -mtriple=i686-- -relocation-model=static | FileCheck -check-prefix=slm %s @@ -12,15 +13,35 @@ @f = common global i32 0, align 4 define void @func() nounwind uwtable { -; atom: imull -; atom-NOT: movl -; atom: imull -; slm: imull -; slm-NOT: movl -; slm: imull -; CHECK: imull -; CHECK: movl -; CHECK: imull +; atom-LABEL: func: +; atom: # %bb.0: # %entry +; atom-NEXT: movl b, %eax +; atom-NEXT: movl e, %ecx +; atom-NEXT: imull c, %eax +; atom-NEXT: imull f, %ecx +; atom-NEXT: movl %eax, a +; atom-NEXT: movl %ecx, d +; atom-NEXT: retl +; +; slm-LABEL: func: +; slm: # %bb.0: # %entry +; slm-NEXT: movl b, %eax +; slm-NEXT: movl e, %ecx +; slm-NEXT: imull c, %eax +; slm-NEXT: imull f, %ecx +; slm-NEXT: movl %eax, a +; slm-NEXT: movl %ecx, d +; slm-NEXT: retl +; +; CHECK-LABEL: func: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl b, %eax +; CHECK-NEXT: movl e, %ecx +; CHECK-NEXT: imull c, %eax +; CHECK-NEXT: imull f, %ecx +; CHECK-NEXT: movl %eax, a +; CHECK-NEXT: movl %ecx, d +; CHECK-NEXT: retl entry: %0 = load i32, i32* @b, align 4 %1 = load i32, i32* @c, align 4 diff --git a/llvm/test/CodeGen/X86/atomic-load-store-wide.ll b/llvm/test/CodeGen/X86/atomic-load-store-wide.ll --- a/llvm/test/CodeGen/X86/atomic-load-store-wide.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store-wide.ll @@ -8,8 +8,8 @@ define void @test1(i64* %ptr, i64 %val1) { ; SSE42-LABEL: test1: ; SSE42: # %bb.0: -; SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; SSE42-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; SSE42-NEXT: movlps %xmm0, (%eax) ; SSE42-NEXT: lock orl $0, (%esp) ; SSE42-NEXT: retl @@ -44,8 +44,8 @@ ; SSE42: # %bb.0: ; SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE42-NEXT: movd %xmm0, %eax ; SSE42-NEXT: pextrd $1, %xmm0, %edx +; SSE42-NEXT: 
movd %xmm0, %eax ; SSE42-NEXT: retl ; ; NOSSE-LABEL: test2: @@ -100,8 +100,8 @@ ; SSE42: # %bb.0: ; SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE42-NEXT: movd %xmm0, %eax ; SSE42-NEXT: pextrd $1, %xmm0, %edx +; SSE42-NEXT: movd %xmm0, %eax ; SSE42-NEXT: retl ; ; NOSSE-LABEL: test4: diff --git a/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll b/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll --- a/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll +++ b/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll @@ -14,15 +14,15 @@ ; CHECK-LABEL: test_overlap_1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl $7, -8(%rdi) +; CHECK-NEXT: movslq %esi, %rcx ; CHECK-NEXT: movq -16(%rdi), %rax ; CHECK-NEXT: movq %rax, (%rdi) ; CHECK-NEXT: movl -8(%rdi), %eax ; CHECK-NEXT: movl %eax, 8(%rdi) ; CHECK-NEXT: movl -4(%rdi), %eax ; CHECK-NEXT: movl %eax, 12(%rdi) -; CHECK-NEXT: movslq %esi, %rax -; CHECK-NEXT: movq %rax, -9(%rdi) -; CHECK-NEXT: movq %rax, -16(%rdi) +; CHECK-NEXT: movq %rcx, -9(%rdi) +; CHECK-NEXT: movq %rcx, -16(%rdi) ; CHECK-NEXT: movb $0, -1(%rdi) ; CHECK-NEXT: movq -16(%rdi), %rax ; CHECK-NEXT: movq %rax, 16(%rdi) @@ -39,9 +39,9 @@ ; DISABLED-LABEL: test_overlap_1: ; DISABLED: # %bb.0: # %entry ; DISABLED-NEXT: movl $7, -8(%rdi) +; DISABLED-NEXT: movslq %esi, %rax ; DISABLED-NEXT: movups -16(%rdi), %xmm0 ; DISABLED-NEXT: movups %xmm0, (%rdi) -; DISABLED-NEXT: movslq %esi, %rax ; DISABLED-NEXT: movq %rax, -9(%rdi) ; DISABLED-NEXT: movq %rax, -16(%rdi) ; DISABLED-NEXT: movb $0, -1(%rdi) @@ -209,6 +209,7 @@ ; CHECK-LABEL: test_overlap_3: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl $7, -10(%rdi) +; CHECK-NEXT: movslq %esi, %rcx ; CHECK-NEXT: movl -16(%rdi), %eax ; CHECK-NEXT: movl %eax, (%rdi) ; CHECK-NEXT: movzwl -12(%rdi), %eax @@ -219,9 +220,8 @@ ; CHECK-NEXT: movl %eax, 10(%rdi) ; CHECK-NEXT: movzwl -2(%rdi), %eax ; CHECK-NEXT: movw %ax, 14(%rdi) -; CHECK-NEXT: movslq %esi, %rax -; CHECK-NEXT: movq %rax, -9(%rdi) -; CHECK-NEXT: movq %rax, -16(%rdi) +; CHECK-NEXT: movq %rcx, -9(%rdi) +; CHECK-NEXT: movq %rcx, -16(%rdi) ; CHECK-NEXT: movb $0, -1(%rdi) ; CHECK-NEXT: movq -16(%rdi), %rax ; CHECK-NEXT: movq %rax, 16(%rdi) @@ -238,9 +238,9 @@ ; DISABLED-LABEL: test_overlap_3: ; DISABLED: # %bb.0: # %entry ; DISABLED-NEXT: movl $7, -10(%rdi) +; DISABLED-NEXT: movslq %esi, %rax ; DISABLED-NEXT: movups -16(%rdi), %xmm0 ; DISABLED-NEXT: movups %xmm0, (%rdi) -; DISABLED-NEXT: movslq %esi, %rax ; DISABLED-NEXT: movq %rax, -9(%rdi) ; DISABLED-NEXT: movq %rax, -16(%rdi) ; DISABLED-NEXT: movb $0, -1(%rdi) @@ -329,8 +329,8 @@ ; CHECK-LABEL: test_overlap_4: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movups -16(%rdi), %xmm0 -; CHECK-NEXT: movups %xmm0, (%rdi) ; CHECK-NEXT: movslq %esi, %rax +; CHECK-NEXT: movups %xmm0, (%rdi) ; CHECK-NEXT: movq %rax, -8(%rdi) ; CHECK-NEXT: movl %eax, -16(%rdi) ; CHECK-NEXT: movl $0, -11(%rdi) @@ -351,8 +351,8 @@ ; DISABLED-LABEL: test_overlap_4: ; DISABLED: # %bb.0: # %entry ; DISABLED-NEXT: movups -16(%rdi), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rdi) ; DISABLED-NEXT: movslq %esi, %rax +; DISABLED-NEXT: movups %xmm0, (%rdi) ; DISABLED-NEXT: movq %rax, -8(%rdi) ; DISABLED-NEXT: movl %eax, -16(%rdi) ; DISABLED-NEXT: movl $0, -11(%rdi) @@ -425,8 +425,8 @@ ; CHECK-LABEL: test_overlap_5: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movups -16(%rdi), %xmm0 -; CHECK-NEXT: movups %xmm0, (%rdi) ; CHECK-NEXT: movslq %esi, %rax +; CHECK-NEXT: movups %xmm0, (%rdi) ; CHECK-NEXT: movq %rax, -16(%rdi) ; CHECK-NEXT: movb %al, -14(%rdi) ; CHECK-NEXT: movb $0, -11(%rdi) 
@@ -447,8 +447,8 @@ ; DISABLED-LABEL: test_overlap_5: ; DISABLED: # %bb.0: # %entry ; DISABLED-NEXT: movups -16(%rdi), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rdi) ; DISABLED-NEXT: movslq %esi, %rax +; DISABLED-NEXT: movups %xmm0, (%rdi) ; DISABLED-NEXT: movq %rax, -16(%rdi) ; DISABLED-NEXT: movb %al, -14(%rdi) ; DISABLED-NEXT: movb $0, -11(%rdi) diff --git a/llvm/test/CodeGen/X86/avoid-sfb.ll b/llvm/test/CodeGen/X86/avoid-sfb.ll --- a/llvm/test/CodeGen/X86/avoid-sfb.ll +++ b/llvm/test/CodeGen/X86/avoid-sfb.ll @@ -19,13 +19,13 @@ ; CHECK-NEXT: movl %edx, 4(%rdi) ; CHECK-NEXT: .LBB0_2: # %if.end ; CHECK-NEXT: movups (%r8), %xmm0 -; CHECK-NEXT: movups %xmm0, (%rcx) ; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: movl 4(%rdi), %edx ; CHECK-NEXT: movl %eax, (%rsi) -; CHECK-NEXT: movl 4(%rdi), %eax -; CHECK-NEXT: movl %eax, 4(%rsi) -; CHECK-NEXT: movq 8(%rdi), %rax -; CHECK-NEXT: movq %rax, 8(%rsi) +; CHECK-NEXT: movl %edx, 4(%rsi) +; CHECK-NEXT: movups %xmm0, (%rcx) +; CHECK-NEXT: movq 8(%rdi), %rcx +; CHECK-NEXT: movq %rcx, 8(%rsi) ; CHECK-NEXT: retq ; ; DISABLED-LABEL: test_conditional_block: @@ -36,9 +36,9 @@ ; DISABLED-NEXT: movl %edx, 4(%rdi) ; DISABLED-NEXT: .LBB0_2: # %if.end ; DISABLED-NEXT: movups (%r8), %xmm0 +; DISABLED-NEXT: movups (%rdi), %xmm1 ; DISABLED-NEXT: movups %xmm0, (%rcx) -; DISABLED-NEXT: movups (%rdi), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rsi) +; DISABLED-NEXT: movups %xmm1, (%rsi) ; DISABLED-NEXT: retq ; ; CHECK-AVX2-LABEL: test_conditional_block: @@ -727,37 +727,37 @@ define void @test_stack(%struct.S6* noalias nocapture sret %agg.result, %struct.S6* byval nocapture readnone align 8 %s1, %struct.S6* byval nocapture align 8 %s2, i32 %x) local_unnamed_addr #0 { ; CHECK-LABEL: test_stack: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movl %esi, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: movups %xmm0, (%rdi) -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movq %rcx, 16(%rdi) -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movl %ecx, 24(%rdi) -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movl %ecx, 28(%rdi) +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r8d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl %edx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, 24(%rdi) +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movq %rcx, 16(%rdi) +; CHECK-NEXT: movl %r8d, 28(%rdi) ; CHECK-NEXT: movl %esi, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, (%rdi) +; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: retq ; ; DISABLED-LABEL: test_stack: ; DISABLED: # %bb.0: # %entry -; DISABLED-NEXT: movq %rdi, %rax ; DISABLED-NEXT: movl %esi, {{[0-9]+}}(%rsp) +; DISABLED-NEXT: movq %rdi, %rax +; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rdi) -; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; DISABLED-NEXT: movups %xmm0, 16(%rdi) -; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; DISABLED-NEXT: movups %xmm1, 16(%rdi) ; 
DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; DISABLED-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; DISABLED-NEXT: movups %xmm0, (%rdi) +; DISABLED-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) ; DISABLED-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; DISABLED-NEXT: retq ; @@ -845,8 +845,8 @@ ; CHECK-NEXT: cmpl $18, %ebp ; CHECK-NEXT: jl .LBB9_2 ; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl %ebp, 4(%rbx) ; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: movl %ebp, 4(%rbx) ; CHECK-NEXT: callq bar ; CHECK-NEXT: .LBB9_2: # %if.end ; CHECK-NEXT: movups (%r15), %xmm0 @@ -892,8 +892,8 @@ ; DISABLED-NEXT: cmpl $18, %ebp ; DISABLED-NEXT: jl .LBB9_2 ; DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movl %ebp, 4(%rbx) ; DISABLED-NEXT: movq %rbx, %rdi +; DISABLED-NEXT: movl %ebp, 4(%rbx) ; DISABLED-NEXT: callq bar ; DISABLED-NEXT: .LBB9_2: # %if.end ; DISABLED-NEXT: movups (%r15), %xmm0 @@ -1054,8 +1054,8 @@ ; CHECK-NEXT: cmpl $18, %edx ; CHECK-NEXT: jl .LBB10_2 ; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl %edx, 4(%rbx) ; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: movl %edx, 4(%rbx) ; CHECK-NEXT: callq bar ; CHECK-NEXT: .LBB10_2: # %if.end ; CHECK-NEXT: movups (%r12), %xmm0 @@ -1102,8 +1102,8 @@ ; DISABLED-NEXT: cmpl $18, %edx ; DISABLED-NEXT: jl .LBB10_2 ; DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movl %edx, 4(%rbx) ; DISABLED-NEXT: movq %rbx, %rdi +; DISABLED-NEXT: movl %edx, 4(%rbx) ; DISABLED-NEXT: callq bar ; DISABLED-NEXT: .LBB10_2: # %if.end ; DISABLED-NEXT: movups (%r15), %xmm0 @@ -1258,14 +1258,14 @@ ; CHECK-NEXT: movups 16(%r8), %xmm1 ; CHECK-NEXT: movups %xmm1, 16(%rcx) ; CHECK-NEXT: movups %xmm0, (%rcx) +; CHECK-NEXT: movups 16(%rdi), %xmm0 ; CHECK-NEXT: movl (%rdi), %eax ; CHECK-NEXT: movl 4(%rdi), %ecx ; CHECK-NEXT: movq 8(%rdi), %rdx -; CHECK-NEXT: movups 16(%rdi), %xmm0 -; CHECK-NEXT: movups %xmm0, 16(%rsi) ; CHECK-NEXT: movl %eax, (%rsi) ; CHECK-NEXT: movl %ecx, 4(%rsi) ; CHECK-NEXT: movq %rdx, 8(%rsi) +; CHECK-NEXT: movups %xmm0, 16(%rsi) ; CHECK-NEXT: retq ; ; DISABLED-LABEL: test_conditional_block_float: @@ -1358,12 +1358,12 @@ ; CHECK-NEXT: movups 16(%r8), %xmm1 ; CHECK-NEXT: movups %xmm1, 16(%rcx) ; CHECK-NEXT: movups %xmm0, (%rcx) +; CHECK-NEXT: movups 16(%rdi), %xmm0 ; CHECK-NEXT: movq (%rdi), %rax ; CHECK-NEXT: movq 8(%rdi), %rcx -; CHECK-NEXT: movups 16(%rdi), %xmm0 -; CHECK-NEXT: movups %xmm0, 16(%rsi) ; CHECK-NEXT: movq %rax, (%rsi) ; CHECK-NEXT: movq %rcx, 8(%rsi) +; CHECK-NEXT: movups %xmm0, 16(%rsi) ; CHECK-NEXT: retq ; ; DISABLED-LABEL: test_conditional_block_ymm: diff --git a/llvm/test/CodeGen/X86/avx-arith.ll b/llvm/test/CodeGen/X86/avx-arith.ll --- a/llvm/test/CodeGen/X86/avx-arith.ll +++ b/llvm/test/CodeGen/X86/avx-arith.ll @@ -191,10 +191,10 @@ define <4 x i64> @vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone { ; CHECK-LABEL: vpaddq: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %x = add <4 x i64> %i, %j @@ -204,10 +204,10 @@ define <8 x i32> @vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone { ; CHECK-LABEL: vpaddd: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; 
CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %x = add <8 x i32> %i, %j @@ -217,10 +217,10 @@ define <16 x i16> @vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone { ; CHECK-LABEL: vpaddw: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddw %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %x = add <16 x i16> %i, %j @@ -230,10 +230,10 @@ define <32 x i8> @vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone { ; CHECK-LABEL: vpaddb: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %x = add <32 x i8> %i, %j @@ -243,10 +243,10 @@ define <4 x i64> @vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone { ; CHECK-LABEL: vpsubq: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpsubq %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %x = sub <4 x i64> %i, %j @@ -256,10 +256,10 @@ define <8 x i32> @vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone { ; CHECK-LABEL: vpsubd: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpsubd %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %x = sub <8 x i32> %i, %j @@ -269,10 +269,10 @@ define <16 x i16> @vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone { ; CHECK-LABEL: vpsubw: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpsubw %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpsubw %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %x = sub <16 x i16> %i, %j @@ -282,10 +282,10 @@ define <32 x i8> @vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone { ; CHECK-LABEL: vpsubb: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %x = sub <32 x i8> %i, %j @@ -295,10 +295,10 @@ define <8 x i32> @vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone { ; CHECK-LABEL: vpmulld: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpmulld %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpmulld %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: 
retq %x = mul <8 x i32> %i, %j @@ -308,10 +308,10 @@ define <16 x i16> @vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone { ; CHECK-LABEL: vpmullw: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %x = mul <16 x i16> %i, %j @@ -324,20 +324,20 @@ ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 ; CHECK-NEXT: vpsrlq $32, %xmm3, %xmm4 -; CHECK-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 ; CHECK-NEXT: vpsrlq $32, %xmm2, %xmm5 +; CHECK-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 ; CHECK-NEXT: vpmuludq %xmm5, %xmm3, %xmm5 -; CHECK-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vpsllq $32, %xmm4, %xmm4 ; CHECK-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vpaddq %xmm4, %xmm2, %xmm2 ; CHECK-NEXT: vpsrlq $32, %xmm0, %xmm3 ; CHECK-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 +; CHECK-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpsllq $32, %xmm4, %xmm4 +; CHECK-NEXT: vpaddq %xmm4, %xmm2, %xmm2 ; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm4 ; CHECK-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 +; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpaddq %xmm3, %xmm4, %xmm3 ; CHECK-NEXT: vpsllq $32, %xmm3, %xmm3 -; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx-win64-args.ll b/llvm/test/CodeGen/X86/avx-win64-args.ll --- a/llvm/test/CodeGen/X86/avx-win64-args.ll +++ b/llvm/test/CodeGen/X86/avx-win64-args.ll @@ -6,8 +6,8 @@ define <8 x float> @test1(<8 x float> %x, <8 x float> %y) nounwind uwtable readnone ssp { entry: ; CHECK: test1 -; CHECK: leaq {{.*}}, %rcx ; CHECK: movl {{.*}}, %edx +; CHECK: leaq {{.*}}, %rcx ; CHECK: call ; CHECK: ret %x1 = fadd <8 x float> %x, %y diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll --- a/llvm/test/CodeGen/X86/avx.ll +++ b/llvm/test/CodeGen/X86/avx.ll @@ -56,8 +56,8 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) { ; CHECK-LABEL: insertps_from_vector_load_offset_2: ; On X32, account for the argument's move to registers -; X32: movl 4(%esp), %eax ; X32: movl 8(%esp), %ecx +; X32: movl 4(%esp), %eax ; CHECK-NOT: mov ;; Try to match a bit more of the instr, since we need the load's offset. 
; CHECK: vinsertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3] @@ -113,8 +113,8 @@ ; CHECK-NOT: mov ; CHECK: insertps $48 ; CHECK: insertps $48 -; CHECK: vaddps ; CHECK: insertps $48 +; CHECK: vaddps ; CHECK: insertps $48 ; CHECK: vaddps ; CHECK: vaddps diff --git a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll --- a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll +++ b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll @@ -6,8 +6,8 @@ define void @test1(float* %A, float* %C) #0 { ; X86-LABEL: test1: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: vandps LCPI0_0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) @@ -33,8 +33,8 @@ define void @test2(float* %A, float* %C) #0 { ; X86-LABEL: test2: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovaps (%ecx), %xmm0 ; X86-NEXT: vorps LCPI1_0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) @@ -60,8 +60,8 @@ define void @test3(float* %A, float* %C) #0 { ; X86-LABEL: test3: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovaps (%ecx), %xmm0 ; X86-NEXT: vxorps LCPI2_0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) @@ -86,8 +86,8 @@ define void @test4(float* %A, float* %C) #0 { ; X86-LABEL: test4: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovaps (%ecx), %xmm0 ; X86-NEXT: vandnps LCPI3_0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) diff --git a/llvm/test/CodeGen/X86/break-false-dep.ll b/llvm/test/CodeGen/X86/break-false-dep.ll --- a/llvm/test/CodeGen/X86/break-false-dep.ll +++ b/llvm/test/CodeGen/X86/break-false-dep.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512vl -mcpu=skx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx -mcpu=corei7-avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512vl -mcpu=skx | FileCheck %s --check-prefixes=AVX,AVX512 define double @t1(float* nocapture %x) nounwind readonly ssp { entry: @@ -190,14 +190,15 @@ ;SSE-LABEL:@loopdep3 ;SSE: xorps [[XMM0:%xmm[0-9]+]], [[XMM0]] ;SSE-NEXT: cvtsi2sdl {{.*}}, [[XMM0]] -;SSE-NEXT: mulsd {{.*}}, [[XMM0]] +;SSE: mulsd {{.*}}, [[XMM0]] ;SSE-NEXT: mulsd {{.*}}, [[XMM0]] ;SSE-NEXT: mulsd {{.*}}, [[XMM0]] ;SSE-NEXT: movsd [[XMM0]], + ;AVX-LABEL:@loopdep3 ;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]] ;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], {{%xmm[0-9]+}} -;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] +;AVX: vmulsd {{.*}}, [[XMM0]], [[XMM0]] ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] ;AVX-NEXT: vmovsd [[XMM0]], @@ -255,9 +256,13 @@ tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() %tmp1 = sitofp i64 %arg to double ret double %tmp1 -;AVX-LABEL:@clearence 
-;AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]] -;AVX-NEXT: vcvtsi2sd {{.*}}, [[XMM6]], {{%xmm[0-9]+}} +;AVX1-LABEL:@clearence +;AVX1: vxorps [[XMM0:%xmm0]], [[XMM0]], [[XMM0]] +;AVX1-NEXT: vcvtsi2sd {{.*}}, [[XMM0]], {{%xmm[0-9]+}} + +;AVX512-LABEL:@clearence +;AVX512: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]] +;AVX512-NEXT: vcvtsi2sd {{.*}}, [[XMM6]], {{%xmm[0-9]+}} } ; Make sure we are making a smart choice regarding undef registers in order to diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll --- a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll +++ b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll @@ -65,8 +65,8 @@ ; ; X64-LABEL: div64: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq %rdi, %rcx +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: orq %rsi, %rcx ; X64-NEXT: shrq $32, %rcx ; X64-NEXT: je .LBB1_1 @@ -188,8 +188,8 @@ ; ; X64-LABEL: div64_hugews: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq %rdi, %rcx +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: orq %rsi, %rcx ; X64-NEXT: shrq $32, %rcx ; X64-NEXT: je .LBB4_1 diff --git a/llvm/test/CodeGen/X86/byval7.ll b/llvm/test/CodeGen/X86/byval7.ll --- a/llvm/test/CodeGen/X86/byval7.ll +++ b/llvm/test/CodeGen/X86/byval7.ll @@ -15,10 +15,10 @@ ; CHECK-NEXT: andl $-16, %esp ; CHECK-NEXT: subl $304, %esp # imm = 0x130 ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [3,2,1,0] -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; CHECK-NEXT: leal {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: leal {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: movl $36, %ecx +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; CHECK-NEXT: rep;movsl (%esi), %es:(%edi) ; CHECK-NEXT: movl $1, (%esp) ; CHECK-NEXT: calll t diff --git a/llvm/test/CodeGen/X86/catchpad-regmask.ll b/llvm/test/CodeGen/X86/catchpad-regmask.ll --- a/llvm/test/CodeGen/X86/catchpad-regmask.ll +++ b/llvm/test/CodeGen/X86/catchpad-regmask.ll @@ -77,8 +77,8 @@ ; CHECK: "?catch${{[0-9]+}}@?0?global_array@4HA": ; CHECK: pushq %rbp -; CHECK: movslq {{.*}}, %[[idx:[^ ]*]] ; CHECK: leaq array(%rip), %[[base:[^ ]*]] +; CHECK: movslq {{.*}}, %[[idx:[^ ]*]] ; CHECK: movl $222, (%[[base]],%[[idx]],4) ; CHECK: popq %rbp ; CHECK: retq # CATCHRET diff --git a/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll b/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll --- a/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll +++ b/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll @@ -14,9 +14,9 @@ ret void ; X64-LABEL: func_cf_vector_x64 - ; X64: movq %rcx, %rax ; X64: movups (%rdx), %xmm0 ; X64: movups 16(%rdx), %xmm1 + ; X64: movq %rcx, %rax ; X64: movaps %xmm0, 32(%rsp) ; X64: movaps %xmm1, 48(%rsp) ; X64: movsd 32(%rsp), %xmm0 # xmm0 = mem[0],zero diff --git a/llvm/test/CodeGen/X86/cfguard-x86-vectorcall.ll b/llvm/test/CodeGen/X86/cfguard-x86-vectorcall.ll --- a/llvm/test/CodeGen/X86/cfguard-x86-vectorcall.ll +++ b/llvm/test/CodeGen/X86/cfguard-x86-vectorcall.ll @@ -11,8 +11,8 @@ ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $48, %esp -; X32-NEXT: movl 8(%ebp), %ecx ; X32-NEXT: movl 12(%ebp), %eax +; X32-NEXT: movl 8(%ebp), %ecx ; X32-NEXT: movups (%eax), %xmm0 ; X32-NEXT: movups 16(%eax), %xmm1 ; X32-NEXT: movaps %xmm0, (%esp) diff --git a/llvm/test/CodeGen/X86/cmov-fp.ll b/llvm/test/CodeGen/X86/cmov-fp.ll --- a/llvm/test/CodeGen/X86/cmov-fp.ll +++ b/llvm/test/CodeGen/X86/cmov-fp.ll @@ -1057,9 +1057,9 @@ ; SSE-LABEL: test17: ; SSE: # %bb.0: ; SSE-NEXT: fldt {{[0-9]+}}(%esp) +; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: fxch %st(1) ; SSE-NEXT: fcmovnbe %st(1), %st ; SSE-NEXT: fstp %st(1) @@ -1110,9 +1110,9 @@ ; SSE-LABEL: test18: ; SSE: # %bb.0: ; SSE-NEXT: fldt {{[0-9]+}}(%esp) +; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: fxch %st(1) ; SSE-NEXT: fcmovnb %st(1), %st ; SSE-NEXT: fstp %st(1) @@ -1163,9 +1163,9 @@ ; SSE-LABEL: test19: ; SSE: # %bb.0: ; SSE-NEXT: fldt {{[0-9]+}}(%esp) +; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: fxch %st(1) ; SSE-NEXT: fcmovb %st(1), %st ; SSE-NEXT: fstp %st(1) @@ -1216,9 +1216,9 @@ ; SSE-LABEL: test20: ; SSE: # %bb.0: ; SSE-NEXT: fldt {{[0-9]+}}(%esp) +; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: fxch %st(1) ; SSE-NEXT: fcmovbe %st(1), %st ; SSE-NEXT: fstp %st(1) @@ -1269,12 +1269,12 @@ ; SSE-LABEL: test21: ; SSE: # %bb.0: ; SSE-NEXT: fldt {{[0-9]+}}(%esp) +; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: fxch %st(1) ; SSE-NEXT: setg %al ; SSE-NEXT: testb %al, %al -; SSE-NEXT: flds {{\.LCPI.*}} -; SSE-NEXT: fxch %st(1) ; SSE-NEXT: fcmovne %st(1), %st ; SSE-NEXT: fstp %st(1) ; SSE-NEXT: retl @@ -1329,12 +1329,12 @@ ; SSE-LABEL: test22: ; SSE: # %bb.0: ; SSE-NEXT: fldt {{[0-9]+}}(%esp) +; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: fxch %st(1) ; SSE-NEXT: setge %al ; SSE-NEXT: testb %al, %al -; SSE-NEXT: flds {{\.LCPI.*}} -; SSE-NEXT: fxch %st(1) ; SSE-NEXT: fcmovne %st(1), %st ; SSE-NEXT: fstp %st(1) ; SSE-NEXT: retl @@ -1388,12 +1388,12 @@ ; SSE-LABEL: test23: ; SSE: # %bb.0: ; SSE-NEXT: fldt {{[0-9]+}}(%esp) +; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: fxch %st(1) ; SSE-NEXT: setl %al ; SSE-NEXT: testb %al, %al -; SSE-NEXT: flds {{\.LCPI.*}} -; SSE-NEXT: fxch %st(1) ; SSE-NEXT: fcmovne %st(1), %st ; SSE-NEXT: fstp %st(1) ; SSE-NEXT: retl @@ -1447,12 +1447,12 @@ ; SSE-LABEL: test24: ; SSE: # %bb.0: ; SSE-NEXT: fldt {{[0-9]+}}(%esp) +; SSE-NEXT: flds {{\.LCPI.*}} ; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; SSE-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: fxch %st(1) ; SSE-NEXT: setle %al ; SSE-NEXT: testb %al, %al -; SSE-NEXT: flds {{\.LCPI.*}} -; SSE-NEXT: fxch %st(1) ; SSE-NEXT: fcmovne %st(1), %st ; SSE-NEXT: fstp %st(1) ; SSE-NEXT: retl diff --git a/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll --- a/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll +++ b/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll @@ -4,9 +4,9 @@ ; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=64-ALL,64-GOOD-RA ; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefixes=64-ALL,64-FAST-RA -; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mattr=+sahf %s -o - | FileCheck %s --check-prefixes=64-ALL,64-GOOD-RA-SAHF +; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mattr=+sahf %s -o - | FileCheck %s --check-prefixes=64-ALL,64-GOOD-RA-SAHF,64-GOOD-RA-SAHF-GENERIC ; RUN: llc 
-mtriple=x86_64-linux-gnu -verify-machineinstrs -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefixes=64-ALL,64-FAST-RA-SAHF -; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mcpu=corei7 %s -o - | FileCheck %s --check-prefixes=64-ALL,64-GOOD-RA-SAHF +; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mcpu=corei7 %s -o - | FileCheck %s --check-prefixes=64-ALL,64-GOOD-RA-SAHF,64-GOOD-RA-SAHF-COREI7 declare i32 @foo() declare i32 @bar(i64) @@ -252,22 +252,90 @@ ; 32-FAST-RA-NEXT: popl %ebx ; 32-FAST-RA-NEXT: retl ; -; 64-ALL-LABEL: test_feed_cmov: -; 64-ALL: # %bb.0: # %entry -; 64-ALL-NEXT: pushq %rbp -; 64-ALL-NEXT: pushq %rbx -; 64-ALL-NEXT: pushq %rax -; 64-ALL-NEXT: movl %edx, %ebx -; 64-ALL-NEXT: movl %esi, %eax -; 64-ALL-NEXT: lock cmpxchgl %edx, (%rdi) -; 64-ALL-NEXT: sete %bpl -; 64-ALL-NEXT: callq foo -; 64-ALL-NEXT: testb %bpl, %bpl -; 64-ALL-NEXT: cmovnel %ebx, %eax -; 64-ALL-NEXT: addq $8, %rsp -; 64-ALL-NEXT: popq %rbx -; 64-ALL-NEXT: popq %rbp -; 64-ALL-NEXT: retq +; 64-GOOD-RA-LABEL: test_feed_cmov: +; 64-GOOD-RA: # %bb.0: # %entry +; 64-GOOD-RA-NEXT: pushq %rbp +; 64-GOOD-RA-NEXT: pushq %rbx +; 64-GOOD-RA-NEXT: pushq %rax +; 64-GOOD-RA-NEXT: movl %edx, %ebx +; 64-GOOD-RA-NEXT: movl %esi, %eax +; 64-GOOD-RA-NEXT: lock cmpxchgl %edx, (%rdi) +; 64-GOOD-RA-NEXT: sete %bpl +; 64-GOOD-RA-NEXT: callq foo +; 64-GOOD-RA-NEXT: testb %bpl, %bpl +; 64-GOOD-RA-NEXT: cmovnel %ebx, %eax +; 64-GOOD-RA-NEXT: addq $8, %rsp +; 64-GOOD-RA-NEXT: popq %rbx +; 64-GOOD-RA-NEXT: popq %rbp +; 64-GOOD-RA-NEXT: retq +; +; 64-FAST-RA-LABEL: test_feed_cmov: +; 64-FAST-RA: # %bb.0: # %entry +; 64-FAST-RA-NEXT: pushq %rbp +; 64-FAST-RA-NEXT: pushq %rbx +; 64-FAST-RA-NEXT: pushq %rax +; 64-FAST-RA-NEXT: movl %edx, %ebx +; 64-FAST-RA-NEXT: movl %esi, %eax +; 64-FAST-RA-NEXT: lock cmpxchgl %edx, (%rdi) +; 64-FAST-RA-NEXT: sete %bpl +; 64-FAST-RA-NEXT: callq foo +; 64-FAST-RA-NEXT: testb %bpl, %bpl +; 64-FAST-RA-NEXT: cmovnel %ebx, %eax +; 64-FAST-RA-NEXT: addq $8, %rsp +; 64-FAST-RA-NEXT: popq %rbx +; 64-FAST-RA-NEXT: popq %rbp +; 64-FAST-RA-NEXT: retq +; +; 64-GOOD-RA-SAHF-GENERIC-LABEL: test_feed_cmov: +; 64-GOOD-RA-SAHF-GENERIC: # %bb.0: # %entry +; 64-GOOD-RA-SAHF-GENERIC-NEXT: pushq %rbp +; 64-GOOD-RA-SAHF-GENERIC-NEXT: pushq %rbx +; 64-GOOD-RA-SAHF-GENERIC-NEXT: pushq %rax +; 64-GOOD-RA-SAHF-GENERIC-NEXT: movl %edx, %ebx +; 64-GOOD-RA-SAHF-GENERIC-NEXT: movl %esi, %eax +; 64-GOOD-RA-SAHF-GENERIC-NEXT: lock cmpxchgl %edx, (%rdi) +; 64-GOOD-RA-SAHF-GENERIC-NEXT: sete %bpl +; 64-GOOD-RA-SAHF-GENERIC-NEXT: callq foo +; 64-GOOD-RA-SAHF-GENERIC-NEXT: testb %bpl, %bpl +; 64-GOOD-RA-SAHF-GENERIC-NEXT: cmovnel %ebx, %eax +; 64-GOOD-RA-SAHF-GENERIC-NEXT: addq $8, %rsp +; 64-GOOD-RA-SAHF-GENERIC-NEXT: popq %rbx +; 64-GOOD-RA-SAHF-GENERIC-NEXT: popq %rbp +; 64-GOOD-RA-SAHF-GENERIC-NEXT: retq +; +; 64-FAST-RA-SAHF-LABEL: test_feed_cmov: +; 64-FAST-RA-SAHF: # %bb.0: # %entry +; 64-FAST-RA-SAHF-NEXT: pushq %rbp +; 64-FAST-RA-SAHF-NEXT: pushq %rbx +; 64-FAST-RA-SAHF-NEXT: pushq %rax +; 64-FAST-RA-SAHF-NEXT: movl %edx, %ebx +; 64-FAST-RA-SAHF-NEXT: movl %esi, %eax +; 64-FAST-RA-SAHF-NEXT: lock cmpxchgl %edx, (%rdi) +; 64-FAST-RA-SAHF-NEXT: sete %bpl +; 64-FAST-RA-SAHF-NEXT: callq foo +; 64-FAST-RA-SAHF-NEXT: testb %bpl, %bpl +; 64-FAST-RA-SAHF-NEXT: cmovnel %ebx, %eax +; 64-FAST-RA-SAHF-NEXT: addq $8, %rsp +; 64-FAST-RA-SAHF-NEXT: popq %rbx +; 64-FAST-RA-SAHF-NEXT: popq %rbp +; 64-FAST-RA-SAHF-NEXT: retq +; +; 64-GOOD-RA-SAHF-COREI7-LABEL: test_feed_cmov: +; 64-GOOD-RA-SAHF-COREI7: # 
%bb.0: # %entry +; 64-GOOD-RA-SAHF-COREI7-NEXT: pushq %rbp +; 64-GOOD-RA-SAHF-COREI7-NEXT: pushq %rbx +; 64-GOOD-RA-SAHF-COREI7-NEXT: pushq %rax +; 64-GOOD-RA-SAHF-COREI7-NEXT: movl %esi, %eax +; 64-GOOD-RA-SAHF-COREI7-NEXT: movl %edx, %ebx +; 64-GOOD-RA-SAHF-COREI7-NEXT: lock cmpxchgl %edx, (%rdi) +; 64-GOOD-RA-SAHF-COREI7-NEXT: sete %bpl +; 64-GOOD-RA-SAHF-COREI7-NEXT: callq foo +; 64-GOOD-RA-SAHF-COREI7-NEXT: testb %bpl, %bpl +; 64-GOOD-RA-SAHF-COREI7-NEXT: cmovnel %ebx, %eax +; 64-GOOD-RA-SAHF-COREI7-NEXT: addq $8, %rsp +; 64-GOOD-RA-SAHF-COREI7-NEXT: popq %rbx +; 64-GOOD-RA-SAHF-COREI7-NEXT: popq %rbp +; 64-GOOD-RA-SAHF-COREI7-NEXT: retq entry: %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst %success = extractvalue { i32, i1 } %res, 1 diff --git a/llvm/test/CodeGen/X86/cmpxchg8b.ll b/llvm/test/CodeGen/X86/cmpxchg8b.ll --- a/llvm/test/CodeGen/X86/cmpxchg8b.ll +++ b/llvm/test/CodeGen/X86/cmpxchg8b.ll @@ -1,24 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown- -mcpu=core2 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=i686-unknown- -mcpu=core2 | FileCheck %s --check-prefixes=CHECK,X86,X86-CORE2 ; RUN: llc < %s -mtriple=x86_64-unknown- -mcpu=core2 | FileCheck %s --check-prefixes=CHECK,X64 ; RUN: llc < %s -mtriple=i686-unknown- -mcpu=i486 | FileCheck %s --check-prefixes=I486 -; RUN: llc < %s -mtriple=i686-unknown- -mcpu=znver1 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=i686-unknown- -mcpu=znver1 | FileCheck %s --check-prefixes=CHECK,X86,X86-ZNVER1 ; Basic 64-bit cmpxchg define void @t1(i64* nocapture %p) nounwind ssp { -; X86-LABEL: t1: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: movl $1, %ebx -; X86-NEXT: lock cmpxchg8b (%esi) -; X86-NEXT: popl %esi -; X86-NEXT: popl %ebx -; X86-NEXT: retl +; X86-CORE2-LABEL: t1: +; X86-CORE2: # %bb.0: # %entry +; X86-CORE2-NEXT: pushl %ebx +; X86-CORE2-NEXT: pushl %esi +; X86-CORE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-CORE2-NEXT: movl $1, %ebx +; X86-CORE2-NEXT: xorl %eax, %eax +; X86-CORE2-NEXT: xorl %edx, %edx +; X86-CORE2-NEXT: xorl %ecx, %ecx +; X86-CORE2-NEXT: lock cmpxchg8b (%esi) +; X86-CORE2-NEXT: popl %esi +; X86-CORE2-NEXT: popl %ebx +; X86-CORE2-NEXT: retl ; ; X64-LABEL: t1: ; X64: # %bb.0: # %entry @@ -48,6 +48,20 @@ ; I486-NEXT: movl %ebp, %esp ; I486-NEXT: popl %ebp ; I486-NEXT: retl +; +; X86-ZNVER1-LABEL: t1: +; X86-ZNVER1: # %bb.0: # %entry +; X86-ZNVER1-NEXT: pushl %ebx +; X86-ZNVER1-NEXT: pushl %esi +; X86-ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-ZNVER1-NEXT: xorl %eax, %eax +; X86-ZNVER1-NEXT: xorl %edx, %edx +; X86-ZNVER1-NEXT: xorl %ecx, %ecx +; X86-ZNVER1-NEXT: movl $1, %ebx +; X86-ZNVER1-NEXT: lock cmpxchg8b (%esi) +; X86-ZNVER1-NEXT: popl %esi +; X86-ZNVER1-NEXT: popl %ebx +; X86-ZNVER1-NEXT: retl entry: %r = cmpxchg i64* %p, i64 0, i64 1 seq_cst seq_cst ret void diff --git a/llvm/test/CodeGen/X86/coalescer-commute1.ll b/llvm/test/CodeGen/X86/coalescer-commute1.ll --- a/llvm/test/CodeGen/X86/coalescer-commute1.ll +++ b/llvm/test/CodeGen/X86/coalescer-commute1.ll @@ -8,17 +8,17 @@ define void @runcont(i32* %source) nounwind { ; CHECK-LABEL: runcont: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl L_NNTOT$non_lazy_ptr, %ecx -; CHECK-NEXT: movl (%ecx), %ecx +; CHECK-NEXT: 
movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl (%ecx), %ecx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_1: ## %bb ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vcvtsi2ssl (%eax,%edx,4), %xmm2, %xmm1 -; CHECK-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: incl %edx +; CHECK-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: cmpl %edx, %ecx ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: ## %bb.2: ## %bb13 diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll --- a/llvm/test/CodeGen/X86/combine-or.ll +++ b/llvm/test/CodeGen/X86/combine-or.ll @@ -239,8 +239,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> @@ -254,9 +254,9 @@ ; CHECK-LABEL: test19: ; CHECK: # %bb.0: ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2] ; CHECK-NEXT: pxor %xmm3, %xmm3 ; CHECK-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2] ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/commute-two-addr.ll b/llvm/test/CodeGen/X86/commute-two-addr.ll --- a/llvm/test/CodeGen/X86/commute-two-addr.ll +++ b/llvm/test/CodeGen/X86/commute-two-addr.ll @@ -38,9 +38,9 @@ define %0 @t3(i32 %lb, i8 zeroext %has_lb, i8 zeroext %lb_inclusive, i32 %ub, i8 zeroext %has_ub, i8 zeroext %ub_inclusive) nounwind { entry: ; DARWIN-LABEL: t3: +; DARWIN: shll $8 ; DARWIN: shlq $32, %rcx -; DARWIN-NEXT: orq %rcx, %rax -; DARWIN-NEXT: shll $8 +; DARWIN: orq %rcx, %rax ; DARWIN-NOT: leaq %tmp21 = zext i32 %lb to i64 %tmp23 = zext i32 %ub to i64 diff --git a/llvm/test/CodeGen/X86/cvtv2f32.ll b/llvm/test/CodeGen/X86/cvtv2f32.ll --- a/llvm/test/CodeGen/X86/cvtv2f32.ll +++ b/llvm/test/CodeGen/X86/cvtv2f32.ll @@ -10,12 +10,12 @@ ; X32: # %bb.0: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: orpd %xmm2, %xmm1 -; X32-NEXT: subsd %xmm2, %xmm1 -; X32-NEXT: cvtsd2ss %xmm1, %xmm1 ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: orpd %xmm2, %xmm1 ; X32-NEXT: orpd %xmm2, %xmm3 +; X32-NEXT: subsd %xmm2, %xmm1 ; X32-NEXT: subsd %xmm2, %xmm3 +; X32-NEXT: cvtsd2ss %xmm1, %xmm1 ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: cvtsd2ss %xmm3, %xmm2 ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] @@ -42,8 +42,8 @@ define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v) { ; X32-LABEL: uitofp_2i32_buildvector_cvt: ; X32: # %bb.0: -; X32-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] ; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X32-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] ; X32-NEXT: por %xmm1, %xmm2 ; X32-NEXT: subpd %xmm1, %xmm2 ; X32-NEXT: cvtpd2ps %xmm2, %xmm1 @@ -52,10 +52,10 @@ ; ; X64-LABEL: uitofp_2i32_buildvector_cvt: ; X64: # %bb.0: +; X64-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] ; X64-NEXT: movd %edi, %xmm1 ; X64-NEXT: pinsrd $1, 
%esi, %xmm1 ; X64-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; X64-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] ; X64-NEXT: por %xmm2, %xmm1 ; X64-NEXT: subpd %xmm2, %xmm1 ; X64-NEXT: cvtpd2ps %xmm1, %xmm1 @@ -71,8 +71,8 @@ define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) { ; X32-LABEL: uitofp_2i32_legalized: ; X32: # %bb.0: -; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X32-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X32-NEXT: por %xmm2, %xmm0 ; X32-NEXT: subpd %xmm2, %xmm0 ; X32-NEXT: cvtpd2ps %xmm0, %xmm0 @@ -81,8 +81,8 @@ ; ; X64-LABEL: uitofp_2i32_legalized: ; X64: # %bb.0: -; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X64-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: subpd %xmm2, %xmm0 ; X64-NEXT: cvtpd2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/dagcombine-buildvector.ll b/llvm/test/CodeGen/X86/dagcombine-buildvector.ll --- a/llvm/test/CodeGen/X86/dagcombine-buildvector.ll +++ b/llvm/test/CodeGen/X86/dagcombine-buildvector.ll @@ -20,8 +20,8 @@ define void @test2(<4 x i16>* %src, <4 x i32>* %dest) nounwind { ; CHECK-LABEL: test2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; CHECK-NEXT: movdqa %xmm0, (%eax) ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll --- a/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll +++ b/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll @@ -13,14 +13,14 @@ ; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: paddd (%edx), %xmm0 +; CHECK-NEXT: shll $4, %ecx ; CHECK-NEXT: movdqa %xmm0, (%edx) ; CHECK-NEXT: movl (%edx), %esi ; CHECK-NEXT: movl 4(%edx), %edi -; CHECK-NEXT: shll $4, %ecx ; CHECK-NEXT: movl 8(%edx), %ebx ; CHECK-NEXT: movl 12(%edx), %edx ; CHECK-NEXT: movl %esi, 12(%eax,%ecx) diff --git a/llvm/test/CodeGen/X86/fmf-flags.ll b/llvm/test/CodeGen/X86/fmf-flags.ll --- a/llvm/test/CodeGen/X86/fmf-flags.ll +++ b/llvm/test/CodeGen/X86/fmf-flags.ll @@ -10,8 +10,8 @@ ; X64-NEXT: rsqrtss %xmm0, %xmm1 ; X64-NEXT: mulss %xmm1, %xmm0 ; X64-NEXT: mulss %xmm1, %xmm0 -; X64-NEXT: addss {{.*}}(%rip), %xmm0 ; X64-NEXT: mulss {{.*}}(%rip), %xmm1 +; X64-NEXT: addss {{.*}}(%rip), %xmm0 ; X64-NEXT: mulss %xmm1, %xmm0 ; X64-NEXT: retq ; @@ -84,10 +84,10 @@ ; X64-NEXT: sqrtss %xmm0, %xmm2 ; X64-NEXT: mulss %xmm1, %xmm0 ; X64-NEXT: mulss %xmm1, %xmm0 -; X64-NEXT: addss {{.*}}(%rip), %xmm0 ; X64-NEXT: mulss {{.*}}(%rip), %xmm1 -; X64-NEXT: mulss %xmm1, %xmm0 +; X64-NEXT: addss {{.*}}(%rip), %xmm0 ; X64-NEXT: movss %xmm2, {{.*}}(%rip) +; X64-NEXT: mulss %xmm1, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: not_so_fast_recip_sqrt: diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll --- 
a/llvm/test/CodeGen/X86/fold-load-vec.ll +++ b/llvm/test/CodeGen/X86/fold-load-vec.ll @@ -8,9 +8,9 @@ ; CHECK-LABEL: sample_test: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp) -; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movlps %xmm0, (%rsp) ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movlps %xmm0, (%rsp) diff --git a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll --- a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll +++ b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll @@ -31,6 +31,8 @@ ; X32-NEXT: LBB0_3: ## %forbody ; X32-NEXT: movaps {{.*#+}} xmm1 = [1.28E+2,1.28E+2,1.28E+2,1.28E+2] ; X32-NEXT: minps LCPI0_3, %xmm1 +; X32-NEXT: xorl %esi, %esi +; X32-NEXT: xorps %xmm3, %xmm3 ; X32-NEXT: cvttps2dq %xmm1, %xmm0 ; X32-NEXT: cvtdq2ps %xmm0, %xmm0 ; X32-NEXT: subps %xmm0, %xmm1 @@ -68,34 +70,32 @@ ; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; X32-NEXT: minps LCPI0_3, %xmm0 +; X32-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; X32-NEXT: movl $0, (%esp) ; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; X32-NEXT: xorps %xmm0, %xmm0 ; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: movl $0, (%esp) -; X32-NEXT: xorl %esi, %esi -; X32-NEXT: xorps %xmm3, %xmm3 ; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; X32-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload ; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload ; X32-NEXT: calll *%esi ; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; X32-NEXT: minps LCPI0_3, %xmm0 -; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; X32-NEXT: pxor %xmm1, %xmm1 ; X32-NEXT: psubd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; X32-NEXT: xorps %xmm3, %xmm3 +; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; X32-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; X32-NEXT: psubd LCPI0_4, %xmm0 +; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; X32-NEXT: movl $0, (%esp) ; X32-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; X32-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; X32-NEXT: por %xmm1, %xmm0 +; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload ; X32-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; X32-NEXT: pxor %xmm0, %xmm0 ; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: movl $0, (%esp) -; X32-NEXT: xorps %xmm3, %xmm3 ; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload ; X32-NEXT: calll *%esi ; X32-NEXT: ud2 ; diff --git a/llvm/test/CodeGen/X86/fp-load-trunc.ll b/llvm/test/CodeGen/X86/fp-load-trunc.ll --- a/llvm/test/CodeGen/X86/fp-load-trunc.ll +++ b/llvm/test/CodeGen/X86/fp-load-trunc.ll @@ -71,8 +71,8 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: cvtpd2ps 16(%eax), %xmm1 ; CHECK-NEXT: cvtpd2ps (%eax), %xmm0 -; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: cvtpd2ps 48(%eax), %xmm2 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: cvtpd2ps 32(%eax), %xmm1 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0] ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/fp-trunc.ll b/llvm/test/CodeGen/X86/fp-trunc.ll --- a/llvm/test/CodeGen/X86/fp-trunc.ll +++ b/llvm/test/CodeGen/X86/fp-trunc.ll @@ -61,12 +61,12 @@ ; CHECK-LABEL: test4: ; CHECK: # %bb.0: ; CHECK-NEXT: subl $12, %esp +; CHECK-NEXT: cvtpd2ps {{[0-9]+}}(%esp), %xmm3 ; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1 ; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: cvtpd2ps %xmm2, %xmm1 -; CHECK-NEXT: cvtpd2ps {{[0-9]+}}(%esp), %xmm2 -; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; CHECK-NEXT: addl $12, %esp ; CHECK-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll --- a/llvm/test/CodeGen/X86/fp128-i128.ll +++ b/llvm/test/CodeGen/X86/fp128-i128.ll @@ -49,13 +49,13 @@ ; SSE-LABEL: TestUnionLD1: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movabsq $281474976710655, %rdx # imm = 0xFFFFFFFFFFFF ; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: shlq $48, %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movabsq $281474976710655, %rdx # imm = 0xFFFFFFFFFFFF ; SSE-NEXT: andq %rdi, %rdx -; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: shlq $48, %rax ; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: orq %rax, %rdx ; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: jmp foo # TAILCALL @@ -63,13 +63,13 @@ ; AVX-LABEL: TestUnionLD1: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movabsq $281474976710655, %rdx # imm = 0xFFFFFFFFFFFF ; AVX-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax -; AVX-NEXT: shlq $48, %rax ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movabsq $281474976710655, %rdx # imm = 0xFFFFFFFFFFFF ; AVX-NEXT: andq %rdi, %rdx -; AVX-NEXT: orq %rax, %rdx +; AVX-NEXT: shlq $48, %rax ; AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX-NEXT: orq %rax, %rdx ; AVX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: jmp foo # TAILCALL @@ -99,8 +99,8 @@ ; SSE: # %bb.0: # %entry ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; @@ -108,8 +108,8 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: retq entry: @@ -133,12 +133,12 @@ ; SSE-NEXT: subq $40, %rsp ; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 ; SSE-NEXT: andq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE-NEXT: movq %rcx, (%rsp) +; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 ; SSE-NEXT: callq __lttf2 ; SSE-NEXT: xorl %ecx, %ecx ; SSE-NEXT: testl %eax, %eax @@ -153,12 +153,12 @@ ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) ; AVX-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1 ; AVX-NEXT: 
andq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX-NEXT: movq %rcx, (%rsp) +; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX-NEXT: vmovaps (%rsp), %xmm0 -; AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1 ; AVX-NEXT: callq __lttf2 ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: testl %eax, %eax @@ -228,8 +228,8 @@ ; SSE: # %bb.0: # %entry ; SSE-NEXT: subq $56, %rsp ; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: testq %rcx, %rax ; SSE-NEXT: je .LBB4_2 ; SSE-NEXT: # %bb.1: @@ -239,10 +239,10 @@ ; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 ; SSE-NEXT: callq __multf3 ; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF -; SSE-NEXT: andq {{[0-9]+}}(%rsp), %rdx ; SSE-NEXT: movabsq $4611123068473966592, %rax # imm = 0x3FFE000000000000 +; SSE-NEXT: andq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: orq %rdx, %rax ; SSE-NEXT: .LBB4_3: # %if.end ; SSE-NEXT: movq %rcx, (%rsp) @@ -255,8 +255,8 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $56, %rsp ; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: testq %rcx, %rax ; AVX-NEXT: je .LBB4_2 ; AVX-NEXT: # %bb.1: @@ -266,10 +266,10 @@ ; AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1 ; AVX-NEXT: callq __multf3 ; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF -; AVX-NEXT: andq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: movabsq $4611123068473966592, %rax # imm = 0x3FFE000000000000 +; AVX-NEXT: andq {{[0-9]+}}(%rsp), %rdx +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: orq %rdx, %rax ; AVX-NEXT: .LBB4_3: # %if.end ; AVX-NEXT: movq %rcx, (%rsp) @@ -309,21 +309,21 @@ define fp128 @TestI128_4(fp128 %x) #0 { ; SSE-LABEL: TestI128_4: ; SSE: # %bb.0: # %entry -; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: jmp __addtf3 # TAILCALL ; ; AVX-LABEL: TestI128_4: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, %xmm1 ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovaps %xmm0, %xmm1 ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: jmp __addtf3 # TAILCALL entry: @@ -364,21 +364,21 @@ define fp128 @acosl(fp128 %x) #0 { ; SSE-LABEL: acosl: ; SSE: # %bb.0: # %entry -; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: jmp __addtf3 # TAILCALL ; ; AVX-LABEL: acosl: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, %xmm1 ; AVX-NEXT: vmovaps %xmm0, 
-{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovaps %xmm0, %xmm1 ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: jmp __addtf3 # TAILCALL entry: @@ -446,14 +446,14 @@ ; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %rbx ; SSE-NEXT: subq $40, %rsp -; SSE-NEXT: movq %rdi, %rbx ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movq %rdi, %rbx ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: callq __gttf2 -; SSE-NEXT: movl %eax, %ebp ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movl %eax, %ebp ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: callq __subtf3 ; SSE-NEXT: testl %ebp, %ebp @@ -469,10 +469,10 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: andps {{.*}}(%rip), %xmm2 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: orps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm1, (%rbx) -; SSE-NEXT: movaps %xmm0, 16(%rbx) ; SSE-NEXT: movq %rbx, %rax +; SSE-NEXT: orps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm0, 16(%rbx) ; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %rbp @@ -483,14 +483,14 @@ ; AVX-NEXT: pushq %rbp ; AVX-NEXT: pushq %rbx ; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: movq %rdi, %rbx ; AVX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 +; AVX-NEXT: movq %rdi, %rbx ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __gttf2 -; AVX-NEXT: movl %eax, %ebp ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: movl %eax, %ebp ; AVX-NEXT: vmovaps %xmm0, %xmm1 ; AVX-NEXT: callq __subtf3 ; AVX-NEXT: testl %ebp, %ebp @@ -505,10 +505,10 @@ ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovaps %xmm2, (%rbx) -; AVX-NEXT: vmovaps %xmm0, 16(%rbx) ; AVX-NEXT: movq %rbx, %rax +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, 16(%rbx) ; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/gather-addresses.ll b/llvm/test/CodeGen/X86/gather-addresses.ll --- a/llvm/test/CodeGen/X86/gather-addresses.ll +++ b/llvm/test/CodeGen/X86/gather-addresses.ll @@ -40,12 +40,12 @@ ; LIN-SSE4-NEXT: pextrd $2, %xmm0, %edx ; LIN-SSE4-NEXT: pextrd $3, %xmm0, %esi ; LIN-SSE4-NEXT: cltq -; LIN-SSE4-NEXT: movslq %ecx, %rcx ; LIN-SSE4-NEXT: movslq %edx, %rdx ; LIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; LIN-SSE4-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; LIN-SSE4-NEXT: movslq %ecx, %rcx ; LIN-SSE4-NEXT: movslq %esi, %rax ; LIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; LIN-SSE4-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; LIN-SSE4-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; LIN-SSE4-NEXT: retq ; @@ -79,12 +79,12 @@ ; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d ; WIN-SSE4-NEXT: pextrd $3, %xmm0, %r9d ; WIN-SSE4-NEXT: cltq -; WIN-SSE4-NEXT: movslq %edx, %rdx ; WIN-SSE4-NEXT: movslq %r8d, %r8 ; WIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; WIN-SSE4-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; WIN-SSE4-NEXT: movslq %edx, %rdx ; WIN-SSE4-NEXT: movslq %r9d, %rax ; WIN-SSE4-NEXT: 
movsd {{.*#+}} xmm1 = mem[0],zero +; WIN-SSE4-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; WIN-SSE4-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; WIN-SSE4-NEXT: retq ; @@ -92,18 +92,18 @@ ; LIN32: # %bb.0: ; LIN32-NEXT: pushl %edi ; LIN32-NEXT: pushl %esi -; LIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; LIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; LIN32-NEXT: movdqa (%edx), %xmm0 ; LIN32-NEXT: pand (%ecx), %xmm0 -; LIN32-NEXT: pextrd $1, %xmm0, %ecx ; LIN32-NEXT: pextrd $2, %xmm0, %edx +; LIN32-NEXT: pextrd $1, %xmm0, %ecx ; LIN32-NEXT: pextrd $3, %xmm0, %esi ; LIN32-NEXT: movd %xmm0, %edi ; LIN32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; LIN32-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; LIN32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; LIN32-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; LIN32-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; LIN32-NEXT: popl %esi ; LIN32-NEXT: popl %edi @@ -161,18 +161,18 @@ ; LIN-SSE4: # %bb.0: ; LIN-SSE4-NEXT: movdqa (%rsi), %xmm0 ; LIN-SSE4-NEXT: pand (%rdx), %xmm0 -; LIN-SSE4-NEXT: movd %xmm0, %eax ; LIN-SSE4-NEXT: pextrd $1, %xmm0, %edx +; LIN-SSE4-NEXT: movd %xmm0, %eax ; LIN-SSE4-NEXT: pextrd $2, %xmm0, %esi ; LIN-SSE4-NEXT: pextrd $3, %xmm0, %edi ; LIN-SSE4-NEXT: andl %ecx, %eax ; LIN-SSE4-NEXT: andl %ecx, %edx -; LIN-SSE4-NEXT: andl %ecx, %esi ; LIN-SSE4-NEXT: andl %ecx, %edi -; LIN-SSE4-NEXT: movq %rdx, %xmm1 +; LIN-SSE4-NEXT: andl %ecx, %esi ; LIN-SSE4-NEXT: movq %rax, %xmm0 -; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; LIN-SSE4-NEXT: movq %rdx, %xmm1 ; LIN-SSE4-NEXT: movq %rdi, %xmm2 +; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; LIN-SSE4-NEXT: movq %rsi, %xmm1 ; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; LIN-SSE4-NEXT: retq @@ -204,18 +204,18 @@ ; WIN-SSE4: # %bb.0: ; WIN-SSE4-NEXT: movdqa (%rdx), %xmm0 ; WIN-SSE4-NEXT: pand (%r8), %xmm0 -; WIN-SSE4-NEXT: movd %xmm0, %eax ; WIN-SSE4-NEXT: pextrd $1, %xmm0, %ecx +; WIN-SSE4-NEXT: movd %xmm0, %eax ; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d ; WIN-SSE4-NEXT: pextrd $3, %xmm0, %edx ; WIN-SSE4-NEXT: andl %r9d, %eax ; WIN-SSE4-NEXT: andl %r9d, %ecx -; WIN-SSE4-NEXT: andl %r9d, %r8d ; WIN-SSE4-NEXT: andl %r9d, %edx -; WIN-SSE4-NEXT: movq %rcx, %xmm1 +; WIN-SSE4-NEXT: andl %r9d, %r8d ; WIN-SSE4-NEXT: movq %rax, %xmm0 -; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; WIN-SSE4-NEXT: movq %rcx, %xmm1 ; WIN-SSE4-NEXT: movq %rdx, %xmm2 +; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; WIN-SSE4-NEXT: movq %r8, %xmm1 ; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; WIN-SSE4-NEXT: retq @@ -224,23 +224,23 @@ ; LIN32: # %bb.0: ; LIN32-NEXT: pushl %edi ; LIN32-NEXT: pushl %esi -; LIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; LIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; LIN32-NEXT: movdqa (%edx), %xmm0 ; LIN32-NEXT: pand (%ecx), %xmm0 -; LIN32-NEXT: movd %xmm0, %ecx ; LIN32-NEXT: pextrd $1, %xmm0, %edx +; LIN32-NEXT: movd %xmm0, %ecx ; LIN32-NEXT: pextrd $2, %xmm0, %esi ; LIN32-NEXT: pextrd $3, %xmm0, %edi ; LIN32-NEXT: andl %eax, %ecx ; LIN32-NEXT: andl %eax, %edx -; LIN32-NEXT: andl %eax, %esi ; LIN32-NEXT: andl %eax, %edi -; LIN32-NEXT: movd %edx, %xmm1 +; LIN32-NEXT: andl %eax, %esi ; LIN32-NEXT: movd %ecx, %xmm0 -; LIN32-NEXT: punpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] +; LIN32-NEXT: movd %edx, %xmm1 ; LIN32-NEXT: movd %edi, %xmm2 +; LIN32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; LIN32-NEXT: movd %esi, %xmm1 ; LIN32-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; LIN32-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/inreg.ll b/llvm/test/CodeGen/X86/inreg.ll --- a/llvm/test/CodeGen/X86/inreg.ll +++ b/llvm/test/CodeGen/X86/inreg.ll @@ -10,10 +10,10 @@ ret void ; DAG-LABEL: g1: ; DAG: subl $[[AMT:.*]], %esp - ; DAG-NEXT: $43, (%esp) ; DAG-NEXT: leal 16(%esp), %eax ; DAG-NEXT: movl $41, %edx ; DAG-NEXT: movl $42, %ecx + ; DAG-NEXT: $43, (%esp) ; DAG-NEXT: calll f ; DAG-NEXT: addl $[[AMT]], %esp ; DAG-NEXT: ret diff --git a/llvm/test/CodeGen/X86/lifetime-alias.ll b/llvm/test/CodeGen/X86/lifetime-alias.ll --- a/llvm/test/CodeGen/X86/lifetime-alias.ll +++ b/llvm/test/CodeGen/X86/lifetime-alias.ll @@ -29,50 +29,50 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [97,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97] -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movabsq $8389209137051166804, %r9 # imm = 0x746C754320656854 +; CHECK-NEXT: movups {{.*}}(%rip), %xmm1 +; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movabsq $7308613581744070988, %rdx # imm = 0x656D69547473614C +; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movabsq $7016996765293437281, %rax # imm = 0x6161616161616161 -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $5632, {{[0-9]+}}(%rsp) # imm = 0x1600 +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $11, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movabsq $8389209137051166804, %rax # imm = 0x746C754320656854 -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $1701999988, -{{[0-9]+}}(%rsp) # imm = 0x65727574 ; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $21, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movabsq $7308613581744070988, %rax # imm = 0x656D69547473614C -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movups {{.*}}(%rip), %xmm1 +; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb -{{[0-9]+}}(%rsp), %al -; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb -{{[0-9]+}}(%rsp), %al -; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $5632, {{[0-9]+}}(%rsp) # imm = 0x1600 +; CHECK-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %r9 +; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: movzwl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps 
-{{[0-9]+}}(%rsp), %xmm1 -; CHECK-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %r8w, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rdx ; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq %rax, do_not_optimize{{.*}}(%rip) -; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq %rax, do_not_optimize{{.*}}(%rip) +; CHECK-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: movq %rdx, {{.*}}(%rip) +; CHECK-NEXT: movq %rcx, {{.*}}(%rip) ; CHECK-NEXT: cmpb $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: jns .LBB0_1 ; CHECK-NEXT: # %bb.2: # %_ZNSt3__312basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEED2Ev.exit50 diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll --- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll +++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll @@ -16,8 +16,8 @@ ; AVX2-LABEL: reassociate_and_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_and_v4i32: @@ -43,8 +43,8 @@ ; AVX2-LABEL: reassociate_or_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_or_v4i32: @@ -70,8 +70,8 @@ ; AVX2-LABEL: reassociate_xor_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_xor_v4i32: @@ -93,17 +93,17 @@ ; SSE: # %bb.0: ; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: paddd %xmm3, %xmm1 ; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_and_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_and_v8i32: @@ -123,17 +123,17 @@ ; SSE: # %bb.0: ; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: paddd %xmm3, %xmm1 ; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_or_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_or_v8i32: @@ -153,17 +153,17 @@ ; SSE: # %bb.0: ; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm6, %xmm4 
-; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: paddd %xmm3, %xmm1 ; SSE-NEXT: pxor %xmm7, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: pxor %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_xor_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_xor_v8i32: @@ -200,12 +200,12 @@ ; ; AVX2-LABEL: reassociate_and_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_and_v16i32: @@ -239,12 +239,12 @@ ; ; AVX2-LABEL: reassociate_or_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_or_v16i32: @@ -278,12 +278,12 @@ ; ; AVX2-LABEL: reassociate_xor_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_xor_v16i32: @@ -311,8 +311,8 @@ ; AVX-LABEL: reassociate_umax_v16i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmaxub %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxub %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = add <16 x i8> %x0, %x1 @@ -326,21 +326,21 @@ define <8 x i16> @reassociate_umax_v8i16(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, <8 x i16> %x3) { ; SSE-LABEL: reassociate_umax_v8i16: ; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] ; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE-NEXT: pxor %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: pmaxsw %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm3 ; SSE-NEXT: pmaxsw %xmm3, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_umax_v8i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmaxuw %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxuw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = add <8 x i16> %x0, %x1 
@@ -354,31 +354,31 @@ define <4 x i32> @reassociate_umax_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, <4 x i32> %x3) { ; SSE-LABEL: reassociate_umax_v4i32: ; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] ; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pxor %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pxor %xmm1, %xmm5 +; SSE-NEXT: pxor %xmm6, %xmm4 +; SSE-NEXT: pxor %xmm6, %xmm5 ; SSE-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm6, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm6 +; SSE-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_umax_v4i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmaxud %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpmaxud %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = add <4 x i32> %x0, %x1 @@ -395,16 +395,16 @@ ; SSE-NEXT: paddq %xmm1, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pxor %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pxor %xmm1, %xmm4 ; SSE-NEXT: pxor %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm5 @@ -413,12 +413,12 @@ ; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: pxor %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm0 @@ -442,8 +442,8 @@ ; AVX512-LABEL: reassociate_umax_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmaxuq %xmm3, %xmm2, %xmm1 -; AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxuq %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vpmaxuq %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: retq %t0 = add <2 x i64> %x0, %x1 @@ -458,23 +458,23 @@ ; SSE-LABEL: reassociate_smax_v16i8: ; SSE: # %bb.0: ; SSE-NEXT: paddb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pcmpgtb %xmm0, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm3, %xmm0 
-; SSE-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pcmpgtb %xmm4, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_smax_v16i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmaxsb %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxsb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = add <16 x i8> %x0, %x1 @@ -496,8 +496,8 @@ ; AVX-LABEL: reassociate_smax_v8i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmaxsw %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxsw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = add <8 x i16> %x0, %x1 @@ -512,23 +512,23 @@ ; SSE-LABEL: reassociate_smax_v4i32: ; SSE: # %bb.0: ; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_smax_v4i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmaxsd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxsd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = add <4 x i32> %x0, %x1 @@ -545,16 +545,16 @@ ; SSE-NEXT: paddq %xmm1, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pxor %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pxor %xmm1, %xmm4 ; SSE-NEXT: pxor %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm5 @@ -563,12 +563,12 @@ ; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: pxor %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm0 @@ -587,8 +587,8 @@ ; AVX512-LABEL: reassociate_smax_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmaxsq %xmm3, %xmm2, %xmm1 -; AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsq %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vpmaxsq %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: retq %t0 = add <2 x i64> %x0, %x1 @@ -610,8 +610,8 @@ ; AVX-LABEL: reassociate_umin_v16i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb 
%xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminub %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminub %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = add <16 x i8> %x0, %x1 @@ -625,21 +625,21 @@ define <8 x i16> @reassociate_umin_v8i16(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, <8 x i16> %x3) { ; SSE-LABEL: reassociate_umin_v8i16: ; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] ; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE-NEXT: pxor %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: pminsw %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm3 ; SSE-NEXT: pminsw %xmm3, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_umin_v8i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminuw %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminuw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpminuw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = add <8 x i16> %x0, %x1 @@ -653,20 +653,20 @@ define <4 x i32> @reassociate_umin_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, <4 x i32> %x3) { ; SSE-LABEL: reassociate_umin_v4i32: ; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] ; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pxor %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pxor %xmm1, %xmm5 +; SSE-NEXT: pxor %xmm6, %xmm4 +; SSE-NEXT: pxor %xmm6, %xmm5 ; SSE-NEXT: pcmpgtd %xmm4, %xmm5 ; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm6, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm6 +; SSE-NEXT: pcmpgtd %xmm6, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 @@ -675,8 +675,8 @@ ; AVX-LABEL: reassociate_umin_v4i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminud %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = add <4 x i32> %x0, %x1 @@ -693,16 +693,16 @@ ; SSE-NEXT: paddq %xmm1, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pxor %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: pxor %xmm1, %xmm5 +; SSE-NEXT: pxor %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE-NEXT: pcmpgtd %xmm4, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm5 @@ -712,11 +712,11 @@ ; SSE-NEXT: pxor %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: pshufd 
{{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm0 @@ -740,8 +740,8 @@ ; AVX512-LABEL: reassociate_umin_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpminuq %xmm3, %xmm2, %xmm1 -; AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminuq %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vpminuq %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: retq %t0 = add <2 x i64> %x0, %x1 @@ -771,8 +771,8 @@ ; AVX-LABEL: reassociate_smin_v16i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminsb %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminsb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpminsb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = add <16 x i8> %x0, %x1 @@ -794,8 +794,8 @@ ; AVX-LABEL: reassociate_smin_v8i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminsw %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminsw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpminsw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = add <8 x i16> %x0, %x1 @@ -825,8 +825,8 @@ ; AVX-LABEL: reassociate_smin_v4i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminsd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminsd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpminsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = add <4 x i32> %x0, %x1 @@ -843,16 +843,16 @@ ; SSE-NEXT: paddq %xmm1, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pxor %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: pxor %xmm1, %xmm5 +; SSE-NEXT: pxor %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE-NEXT: pcmpgtd %xmm4, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm5 @@ -862,11 +862,11 @@ ; SSE-NEXT: pxor %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm0 @@ -885,8 +885,8 @@ ; AVX512-LABEL: reassociate_smin_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpminsq %xmm3, %xmm2, %xmm1 -; AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsq %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vpminsq %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: retq %t0 = add <2 x i64> %x0, %x1 @@ -903,18 +903,18 @@ ; SSE-LABEL: reassociate_umax_v32i8: ; SSE: # %bb.0: ; SSE-NEXT: paddb %xmm2, %xmm0 -; SSE-NEXT: paddb %xmm3, %xmm1 ; SSE-NEXT: pmaxub %xmm6, %xmm4 -; SSE-NEXT: pmaxub %xmm4, %xmm0 +; SSE-NEXT: paddb %xmm3, %xmm1 ; SSE-NEXT: pmaxub %xmm7, %xmm5 +; SSE-NEXT: pmaxub %xmm4, %xmm0 ; SSE-NEXT: pmaxub %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: 
reassociate_umax_v32i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmaxub %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpmaxub %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpmaxub %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = add <32 x i8> %x0, %x1 @@ -928,28 +928,28 @@ define <16 x i16> @reassociate_umax_v16i16(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, <16 x i16> %x3) { ; SSE-LABEL: reassociate_umax_v16i16: ; SSE: # %bb.0: -; SSE-NEXT: paddw %xmm2, %xmm0 ; SSE-NEXT: paddw %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE-NEXT: pxor %xmm2, %xmm5 -; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm5 +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: pxor %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm6 +; SSE-NEXT: pxor %xmm3, %xmm7 ; SSE-NEXT: pmaxsw %xmm5, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm4 -; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pmaxsw %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm6 ; SSE-NEXT: pmaxsw %xmm6, %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm7 ; SSE-NEXT: pmaxsw %xmm7, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_umax_v16i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmaxuw %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpmaxuw %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpmaxuw %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = add <16 x i16> %x0, %x1 @@ -964,37 +964,37 @@ ; SSE-LABEL: reassociate_umax_v8i32: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: paddd %xmm2, %xmm0 -; SSE-NEXT: paddd %xmm3, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: paddd %xmm3, %xmm8 ; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm2 ; SSE-NEXT: pxor %xmm1, %xmm3 ; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm8 ; SSE-NEXT: pxor %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pxor %xmm1, %xmm5 -; SSE-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm8 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: pcmpgtd %xmm8, %xmm3 ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pxor %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pxor %xmm1, %xmm4 +; SSE-NEXT: pxor %xmm7, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pcmpgtd %xmm4, %xmm1 ; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pxor %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm7, %xmm1 -; SSE-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 @@ -1003,8 +1003,8 @@ ; AVX-LABEL: reassociate_umax_v8i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmaxud %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 
+; AVX-NEXT: vpmaxud %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpmaxud %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = add <8 x i32> %x0, %x1 @@ -1018,64 +1018,64 @@ define <4 x i64> @reassociate_umax_v4i64(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, <4 x i64> %x3) { ; SSE-LABEL: reassociate_umax_v4i64: ; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm8, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pxor %xmm8, %xmm3 ; SSE-NEXT: pxor %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pxor %xmm8, %xmm2 +; SSE-NEXT: pxor %xmm8, %xmm1 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pxor %xmm8, %xmm1 ; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pxor %xmm8, %xmm1 ; SSE-NEXT: pxor %xmm7, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm1, %xmm8 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm1 @@ -1099,8 +1099,8 @@ ; AVX512-LABEL: reassociate_umax_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmaxuq %ymm3, %ymm2, %ymm1 -; AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmaxuq %ymm3, %ymm2, %ymm2 +; AVX512-NEXT: vpmaxuq %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: retq %t0 = add <4 x i64> %x0, %x1 @@ 
-1114,35 +1114,35 @@ define <32 x i8> @reassociate_smax_v32i8(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3) { ; SSE-LABEL: reassociate_smax_v32i8: ; SSE: # %bb.0: -; SSE-NEXT: paddb %xmm2, %xmm0 ; SSE-NEXT: paddb %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: paddb %xmm2, %xmm0 +; SSE-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pcmpgtb %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE-NEXT: pcmpgtb %xmm3, %xmm1 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_smax_v32i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmaxsb %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpmaxsb %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpmaxsb %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = add <32 x i8> %x0, %x1 @@ -1157,18 +1157,18 @@ ; SSE-LABEL: reassociate_smax_v16i16: ; SSE: # %bb.0: ; SSE-NEXT: paddw %xmm2, %xmm0 -; SSE-NEXT: paddw %xmm3, %xmm1 ; SSE-NEXT: pmaxsw %xmm6, %xmm4 -; SSE-NEXT: pmaxsw %xmm4, %xmm0 +; SSE-NEXT: paddw %xmm3, %xmm1 ; SSE-NEXT: pmaxsw %xmm7, %xmm5 +; SSE-NEXT: pmaxsw %xmm4, %xmm0 ; SSE-NEXT: pmaxsw %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_smax_v16i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmaxsw %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpmaxsw %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = add <16 x i16> %x0, %x1 @@ -1182,35 +1182,35 @@ define <8 x i32> @reassociate_smax_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, <8 x i32> %x3) { ; SSE-LABEL: reassociate_smax_v8i32: ; SSE: # %bb.0: -; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_smax_v8i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmaxsd %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; 
AVX-NEXT: vpmaxsd %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = add <8 x i32> %x0, %x1 @@ -1224,64 +1224,64 @@ define <4 x i64> @reassociate_smax_v4i64(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, <4 x i64> %x3) { ; SSE-LABEL: reassociate_smax_v4i64: ; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm8, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pxor %xmm8, %xmm3 ; SSE-NEXT: pxor %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pxor %xmm8, %xmm2 +; SSE-NEXT: pxor %xmm8, %xmm1 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pxor %xmm8, %xmm1 ; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pxor %xmm8, %xmm1 ; SSE-NEXT: pxor %xmm7, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm1, %xmm8 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm1 @@ -1300,8 +1300,8 @@ ; AVX512-LABEL: reassociate_smax_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmaxsq %ymm3, %ymm2, %ymm1 -; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmaxsq %ymm3, %ymm2, %ymm2 +; AVX512-NEXT: vpmaxsq %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: retq %t0 = add <4 x i64> %x0, %x1 @@ -1316,18 +1316,18 @@ 
; SSE-LABEL: reassociate_umin_v32i8: ; SSE: # %bb.0: ; SSE-NEXT: paddb %xmm2, %xmm0 -; SSE-NEXT: paddb %xmm3, %xmm1 ; SSE-NEXT: pminub %xmm6, %xmm4 -; SSE-NEXT: pminub %xmm4, %xmm0 +; SSE-NEXT: paddb %xmm3, %xmm1 ; SSE-NEXT: pminub %xmm7, %xmm5 +; SSE-NEXT: pminub %xmm4, %xmm0 ; SSE-NEXT: pminub %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_umin_v32i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpminub %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpminub %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpminub %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = add <32 x i8> %x0, %x1 @@ -1341,28 +1341,28 @@ define <16 x i16> @reassociate_umin_v16i16(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, <16 x i16> %x3) { ; SSE-LABEL: reassociate_umin_v16i16: ; SSE: # %bb.0: -; SSE-NEXT: paddw %xmm2, %xmm0 ; SSE-NEXT: paddw %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE-NEXT: pxor %xmm2, %xmm5 -; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm5 +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: pxor %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm6 +; SSE-NEXT: pxor %xmm3, %xmm7 ; SSE-NEXT: pminsw %xmm5, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm4 -; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pminsw %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm6 ; SSE-NEXT: pminsw %xmm6, %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm7 ; SSE-NEXT: pminsw %xmm7, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_umin_v16i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpminuw %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpminuw %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpminuw %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = add <16 x i16> %x0, %x1 @@ -1376,37 +1376,37 @@ define <8 x i32> @reassociate_umin_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, <8 x i32> %x3) { ; SSE-LABEL: reassociate_umin_v8i32: ; SSE: # %bb.0: -; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] ; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pxor %xmm3, %xmm8 +; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm2 +; SSE-NEXT: pxor %xmm9, %xmm8 +; SSE-NEXT: pxor %xmm9, %xmm2 ; SSE-NEXT: pcmpgtd %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm8 ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pxor %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pxor %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm9, %xmm8 +; SSE-NEXT: pxor %xmm9, %xmm1 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm8 ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pxor %xmm9, %xmm1 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pxor %xmm9, %xmm0 ; SSE-NEXT: pcmpgtd %xmm1, %xmm0 +; 
SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm9, %xmm1 +; SSE-NEXT: pxor %xmm7, %xmm9 ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pcmpgtd %xmm9, %xmm1 ; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 @@ -1415,8 +1415,8 @@ ; AVX-LABEL: reassociate_umin_v8i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpminud %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpminud %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpminud %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = add <8 x i32> %x0, %x1 @@ -1430,64 +1430,64 @@ define <4 x i64> @reassociate_umin_v4i64(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, <4 x i64> %x3) { ; SSE-LABEL: reassociate_umin_v4i64: ; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm8, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pxor %xmm8, %xmm2 +; SSE-NEXT: pxor %xmm8, %xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pxor %xmm8, %xmm2 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pxor %xmm8, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pxor %xmm8, %xmm1 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pxor %xmm8, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm7, %xmm8 ; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pcmpgtd %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm1, %xmm8 +; 
SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm1 @@ -1511,8 +1511,8 @@ ; AVX512-LABEL: reassociate_umin_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpminuq %ymm3, %ymm2, %ymm1 -; AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpminuq %ymm3, %ymm2, %ymm2 +; AVX512-NEXT: vpminuq %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: retq %t0 = add <4 x i64> %x0, %x1 @@ -1526,15 +1526,15 @@ define <32 x i8> @reassociate_smin_v32i8(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3) { ; SSE-LABEL: reassociate_smin_v32i8: ; SSE: # %bb.0: -; SSE-NEXT: paddb %xmm2, %xmm0 ; SSE-NEXT: paddb %xmm3, %xmm1 +; SSE-NEXT: paddb %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pcmpgtb %xmm5, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pcmpgtb %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 @@ -1542,9 +1542,9 @@ ; SSE-NEXT: pcmpgtb %xmm6, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pcmpgtb %xmm7, %xmm1 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 @@ -1553,8 +1553,8 @@ ; AVX-LABEL: reassociate_smin_v32i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpminsb %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpminsb %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpminsb %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = add <32 x i8> %x0, %x1 @@ -1569,18 +1569,18 @@ ; SSE-LABEL: reassociate_smin_v16i16: ; SSE: # %bb.0: ; SSE-NEXT: paddw %xmm2, %xmm0 -; SSE-NEXT: paddw %xmm3, %xmm1 ; SSE-NEXT: pminsw %xmm6, %xmm4 -; SSE-NEXT: pminsw %xmm4, %xmm0 +; SSE-NEXT: paddw %xmm3, %xmm1 ; SSE-NEXT: pminsw %xmm7, %xmm5 +; SSE-NEXT: pminsw %xmm4, %xmm0 ; SSE-NEXT: pminsw %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_smin_v16i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpminsw %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpminsw %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpminsw %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = add <16 x i16> %x0, %x1 @@ -1594,15 +1594,15 @@ define <8 x i32> @reassociate_smin_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, <8 x i32> %x3) { ; SSE-LABEL: reassociate_smin_v8i32: ; SSE: # %bb.0: -; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pcmpgtd %xmm5, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 @@ -1610,9 +1610,9 @@ ; SSE-NEXT: pcmpgtd %xmm6, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pcmpgtd %xmm7, %xmm1 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 @@ -1621,8 +1621,8 @@ ; 
AVX-LABEL: reassociate_smin_v8i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpminsd %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpminsd %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vpminsd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = add <8 x i32> %x0, %x1 @@ -1636,64 +1636,64 @@ define <4 x i64> @reassociate_smin_v4i64(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, <4 x i64> %x3) { ; SSE-LABEL: reassociate_smin_v4i64: ; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm8, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pxor %xmm8, %xmm2 +; SSE-NEXT: pxor %xmm8, %xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pxor %xmm8, %xmm2 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pxor %xmm8, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pxor %xmm8, %xmm1 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pxor %xmm8, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm7, %xmm8 ; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pcmpgtd %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm1 @@ -1712,8 +1712,8 @@ ; AVX512-LABEL: reassociate_smin_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpminsq %ymm3, %ymm2, %ymm1 -; AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0 +; 
AVX512-NEXT: vpminsq %ymm3, %ymm2, %ymm2 +; AVX512-NEXT: vpminsq %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: retq %t0 = add <4 x i64> %x0, %x1 @@ -1745,19 +1745,19 @@ ; ; AVX2-LABEL: reassociate_umax_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxub %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpmaxub %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxub %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpmaxub %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxub %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxub %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpmaxub %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxub %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_umax_v64i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxub %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxub %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmaxub %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <64 x i8> %x0, %x1 @@ -1772,59 +1772,59 @@ ; SSE-LABEL: reassociate_umax_v32i16: ; SSE: # %bb.0: ; SSE-NEXT: paddw %xmm4, %xmm0 -; SSE-NEXT: paddw %xmm5, %xmm1 ; SSE-NEXT: paddw %xmm6, %xmm2 -; SSE-NEXT: paddw %xmm7, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: paddw %xmm7, %xmm3 +; SSE-NEXT: paddw %xmm5, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pxor %xmm4, %xmm5 -; SSE-NEXT: pmaxsw %xmm3, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm6 ; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm1 +; SSE-NEXT: pxor %xmm4, %xmm0 +; SSE-NEXT: pmaxsw %xmm3, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: pmaxsw %xmm2, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: pmaxsw %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pmaxsw %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: pmaxsw %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pmaxsw %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: pmaxsw %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pmaxsw %xmm5, %xmm3 +; SSE-NEXT: pmaxsw %xmm6, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_umax_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxuw %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpmaxuw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxuw %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpmaxuw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxuw %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxuw %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpmaxuw %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxuw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_umax_v32i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxuw %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxuw %zmm3, %zmm2, %zmm2 +; 
AVX512-NEXT: vpmaxuw %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <32 x i16> %x0, %x1 @@ -1840,100 +1840,100 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm7, %xmm9 ; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: paddd %xmm7, %xmm0 -; SSE-NEXT: paddd %xmm5, %xmm1 ; SSE-NEXT: paddd %xmm6, %xmm2 +; SSE-NEXT: paddd %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: paddd %xmm9, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pxor %xmm3, %xmm6 +; SSE-NEXT: paddd %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pxor %xmm3, %xmm5 +; SSE-NEXT: pxor %xmm3, %xmm6 ; SSE-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm5 ; SSE-NEXT: pand %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm5 ; SSE-NEXT: pxor %xmm3, %xmm5 +; SSE-NEXT: por %xmm4, %xmm6 ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pxor %xmm3, %xmm4 ; SSE-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pand %xmm5, %xmm15 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm15, %xmm5 ; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: pxor %xmm3, %xmm2 +; SSE-NEXT: por %xmm15, %xmm5 ; SSE-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pand %xmm2, %xmm14 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: por %xmm14, %xmm2 ; SSE-NEXT: pcmpgtd %xmm4, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm13 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: pxor %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pxor %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pxor %xmm3, %xmm0 ; SSE-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pand %xmm0, %xmm12 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pxor %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pxor %xmm3, %xmm4 ; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: por %xmm12, %xmm0 ; SSE-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pand %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pxor %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm4 ; SSE-NEXT: pxor %xmm3, %xmm2 +; SSE-NEXT: por %xmm11, %xmm1 ; SSE-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: pxor %xmm7, %xmm3 ; 
SSE-NEXT: pand %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pcmpgtd %xmm4, %xmm3 ; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pxor %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pxor %xmm5, %xmm3 -; SSE-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_umax_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxud %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxud %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpmaxud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxud %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxud %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpmaxud %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxud %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_umax_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxud %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxud %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmaxud %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -1949,153 +1949,153 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: paddq %xmm4, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm6, %xmm2 ; SSE-NEXT: paddq %xmm7, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: paddq %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm5 ; SSE-NEXT: movdqa %xmm5, %xmm7 ; SSE-NEXT: pcmpgtd %xmm6, %xmm7 ; SSE-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] ; SSE-NEXT: pand %xmm5, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: pxor %xmm4, %xmm6 ; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pxor %xmm4, %xmm6 ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pcmpgtd %xmm6, %xmm7 ; SSE-NEXT: pcmpeqd %xmm3, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm6, %xmm8 ; SSE-NEXT: pand %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm8 ; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pcmpgtd %xmm5, %xmm6 ; SSE-NEXT: pcmpeqd %xmm2, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm2, %xmm5 -; 
SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] ; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm1 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE-NEXT: pcmpeqd %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: pand %xmm5, %xmm7 ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm2, %xmm5 ; SSE-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE-NEXT: pcmpgtd %xmm2, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm4 +; SSE-NEXT: pxor %xmm7, %xmm4 +; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSE-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] ; SSE-NEXT: pand %xmm3, %xmm4 
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_umax_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm3, %ymm5, %ymm8 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm9 +; AVX2-NEXT: vpxor %ymm3, %ymm4, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm9, %ymm8, %ymm8 ; AVX2-NEXT: vblendvpd %ymm8, %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm3, %ymm4, %ymm2 -; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm5 -; AVX2-NEXT: vpcmpgtq %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm8 +; AVX2-NEXT: vpcmpgtq %ymm8, %ymm2, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vxorpd %ymm3, %ymm0, %ymm2 ; AVX2-NEXT: vpxor %ymm3, %ymm6, %ymm4 +; AVX2-NEXT: vxorpd %ymm3, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm6, %ymm0, %ymm0 ; AVX2-NEXT: vxorpd %ymm3, %ymm1, %ymm2 @@ -2107,8 +2107,8 @@ ; AVX512-LABEL: reassociate_umax_v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxuq %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxuq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmaxuq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <8 x i64> %x0, %x1 @@ -2122,55 +2122,55 @@ define <64 x i8> @reassociate_smax_v64i8(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, <64 x i8> %x3) { ; SSE-LABEL: reassociate_smax_v64i8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 ; SSE-NEXT: paddb %xmm4, %xmm0 -; SSE-NEXT: paddb %xmm5, %xmm1 -; SSE-NEXT: paddb %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 ; SSE-NEXT: paddb %xmm7, %xmm3 +; SSE-NEXT: paddb %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: paddb %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pcmpgtb %xmm3, %xmm4 ; SSE-NEXT: pand %xmm4, %xmm15 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 ; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE-NEXT: por %xmm15, %xmm4 ; SSE-NEXT: pand %xmm3, %xmm14 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm13 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm13, %xmm2 ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE-NEXT: por %xmm13, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm12 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: pcmpgtb %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: pandn %xmm1, %xmm0 -; 
SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pcmpgtb %xmm3, %xmm2 +; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pcmpgtb %xmm4, %xmm3 +; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: por %xmm8, %xmm3 @@ -2178,19 +2178,19 @@ ; ; AVX2-LABEL: reassociate_smax_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxsb %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpmaxsb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxsb %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpmaxsb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxsb %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsb %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpmaxsb %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smax_v64i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxsb %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsb %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmaxsb %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <64 x i8> %x0, %x1 @@ -2220,19 +2220,19 @@ ; ; AVX2-LABEL: reassociate_smax_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxsw %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxsw %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxsw %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsw %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpmaxsw %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smax_v32i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxsw %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsw %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmaxsw %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <32 x i16> %x0, %x1 @@ -2246,55 +2246,55 @@ define <16 x i32> @reassociate_smax_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, <16 x i32> %x3) { ; SSE-LABEL: reassociate_smax_v16i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 ; SSE-NEXT: paddd %xmm4, %xmm0 -; SSE-NEXT: paddd %xmm5, %xmm1 -; SSE-NEXT: paddd %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 ; SSE-NEXT: paddd %xmm7, %xmm3 +; SSE-NEXT: paddd %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: paddd %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE-NEXT: pand %xmm4, %xmm15 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por 
%xmm15, %xmm4 ; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE-NEXT: por %xmm15, %xmm4 ; SSE-NEXT: pand %xmm3, %xmm14 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm13 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm13, %xmm2 ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE-NEXT: por %xmm13, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm12 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: por %xmm8, %xmm3 @@ -2302,19 +2302,19 @@ ; ; AVX2-LABEL: reassociate_smax_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxsd %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxsd %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxsd %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpmaxsd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxsd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smax_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxsd %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsd %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -2330,157 +2330,157 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: paddq %xmm4, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm6, %xmm2 ; SSE-NEXT: paddq %xmm7, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: paddq %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm5 ; SSE-NEXT: movdqa %xmm5, %xmm7 ; SSE-NEXT: pcmpgtd %xmm6, %xmm7 ; SSE-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] ; SSE-NEXT: pand %xmm5, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: pxor %xmm4, %xmm6 ; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pxor %xmm4, %xmm6 ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pcmpgtd %xmm6, %xmm7 ; SSE-NEXT: pcmpeqd %xmm3, %xmm6 ; 
SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm6, %xmm8 ; SSE-NEXT: pand %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm8 ; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pcmpgtd %xmm5, %xmm6 ; SSE-NEXT: pcmpeqd %xmm2, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] ; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm1 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE-NEXT: pcmpeqd %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: pand %xmm5, %xmm7 ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm2, %xmm5 ; SSE-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE-NEXT: pcmpgtd 
%xmm2, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm4 +; SSE-NEXT: pxor %xmm7, %xmm4 +; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSE-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_smax_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm5, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm5, %ymm3 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm2 +; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm6, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm7, %ymm3 ; AVX2-NEXT: vblendvpd %ymm2, %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm7, %ymm2 -; AVX2-NEXT: vblendvpd %ymm2, %ymm7, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm3, %ymm7, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smax_v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxsq %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <8 x i64> %x0, %x1 @@ -2510,19 +2510,19 @@ ; ; AVX2-LABEL: reassociate_umin_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminub %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpminub %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminub %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpminub %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminub %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpminub %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpminub %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpminub %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_umin_v64i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminub %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminub %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpminub %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <64 x i8> %x0, %x1 @@ -2537,59 +2537,59 @@ ; SSE-LABEL: reassociate_umin_v32i16: ; SSE: # %bb.0: ; SSE-NEXT: paddw %xmm4, %xmm0 -; SSE-NEXT: paddw %xmm5, %xmm1 ; SSE-NEXT: paddw %xmm6, %xmm2 -; SSE-NEXT: paddw %xmm7, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: paddw %xmm7, %xmm3 +; SSE-NEXT: paddw %xmm5, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pxor %xmm4, %xmm5 -; SSE-NEXT: pminsw 
%xmm3, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm6 ; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm1 +; SSE-NEXT: pxor %xmm4, %xmm0 +; SSE-NEXT: pminsw %xmm3, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: pminsw %xmm2, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: pminsw %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pminsw %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: pminsw %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pminsw %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: pminsw %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pminsw %xmm5, %xmm3 +; SSE-NEXT: pminsw %xmm6, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_umin_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminuw %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminuw %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpminuw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminuw %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpminuw %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpminuw %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_umin_v32i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminuw %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminuw %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpminuw %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <32 x i16> %x0, %x1 @@ -2603,78 +2603,78 @@ define <16 x i32> @reassociate_umin_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, <16 x i32> %x3) { ; SSE-LABEL: reassociate_umin_v16i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: paddd %xmm4, %xmm0 ; SSE-NEXT: paddd %xmm5, %xmm1 -; SSE-NEXT: paddd %xmm6, %xmm2 -; SSE-NEXT: paddd %xmm7, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pxor %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pxor %xmm5, %xmm4 -; SSE-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE-NEXT: pand %xmm4, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: paddd %xmm7, %xmm3 +; SSE-NEXT: paddd %xmm4, %xmm0 +; SSE-NEXT: paddd %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; 
SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pxor %xmm5, %xmm4 ; SSE-NEXT: pxor %xmm5, %xmm6 +; SSE-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm6 ; SSE-NEXT: pxor %xmm5, %xmm3 +; SSE-NEXT: por %xmm15, %xmm4 ; SSE-NEXT: pcmpgtd %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm6 ; SSE-NEXT: pand %xmm3, %xmm14 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: pxor %xmm5, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pxor %xmm5, %xmm6 ; SSE-NEXT: pxor %xmm5, %xmm2 +; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: pand %xmm2, %xmm13 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm13, %xmm2 ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pxor %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: pxor %xmm5, %xmm6 +; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: por %xmm13, %xmm2 ; SSE-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm12 ; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pxor %xmm5, %xmm1 ; SSE-NEXT: por %xmm12, %xmm6 ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pxor %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pxor %xmm5, %xmm1 ; SSE-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pxor %xmm5, %xmm1 ; SSE-NEXT: pxor %xmm5, %xmm6 +; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: pcmpgtd %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pxor %xmm5, %xmm6 +; SSE-NEXT: pxor %xmm5, %xmm2 +; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: pcmpgtd %xmm6, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pxor %xmm5, %xmm3 ; SSE-NEXT: pxor %xmm8, %xmm5 +; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: pcmpgtd %xmm5, %xmm3 ; SSE-NEXT: pand %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm4, %xmm3 @@ -2683,19 +2683,19 @@ ; ; AVX2-LABEL: reassociate_umin_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminud %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminud %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpminud %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpminud %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_umin_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminud %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminud %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpminud %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -2709,112 +2709,113 @@ define <8 x i64> @reassociate_umin_v8i64(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, <8 x i64> %x3) { ; SSE-LABEL: reassociate_umin_v8i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; 
SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm7, %xmm3 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: paddq %xmm7, %xmm3 +; SSE-NEXT: paddq %xmm6, %xmm2 +; SSE-NEXT: paddq %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm5 ; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] ; SSE-NEXT: pand %xmm5, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 ; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pxor %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pcmpgtd %xmm3, %xmm7 ; SSE-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE-NEXT: pcmpgtd %xmm3, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm6, %xmm8 ; SSE-NEXT: pand %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm8 ; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pxor %xmm4, %xmm5 ; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE-NEXT: pcmpgtd %xmm2, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] ; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm1 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] +; SSE-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: pand %xmm5, %xmm7 ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE-NEXT: pcmpeqd 
%xmm0, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE-NEXT: pcmpeqd %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE-NEXT: pcmpeqd %xmm2, %xmm3 @@ -2823,13 +2824,12 @@ ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm4 +; SSE-NEXT: pxor %xmm7, %xmm4 +; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: pcmpgtd %xmm4, %xmm6 ; SSE-NEXT: pcmpeqd %xmm3, %xmm4 @@ -2838,26 +2838,26 @@ ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_umin_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm3, %ymm5, %ymm8 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm9 +; AVX2-NEXT: vpxor %ymm3, %ymm4, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vblendvpd %ymm8, %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm3, %ymm4, %ymm2 -; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm5 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm8 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm8, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vxorpd %ymm3, %ymm0, %ymm2 ; AVX2-NEXT: vpxor %ymm3, %ymm6, %ymm4 +; AVX2-NEXT: vxorpd %ymm3, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm6, %ymm0, %ymm0 ; AVX2-NEXT: vxorpd %ymm3, %ymm1, %ymm2 @@ -2869,8 +2869,8 @@ ; AVX512-LABEL: reassociate_umin_v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminuq %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminuq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpminuq %zmm2, 
%zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <8 x i64> %x0, %x1 @@ -2884,35 +2884,35 @@ define <64 x i8> @reassociate_smin_v64i8(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, <64 x i8> %x3) { ; SSE-LABEL: reassociate_smin_v64i8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: paddb %xmm7, %xmm3 ; SSE-NEXT: paddb %xmm4, %xmm0 -; SSE-NEXT: paddb %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 ; SSE-NEXT: paddb %xmm6, %xmm2 -; SSE-NEXT: paddb %xmm7, %xmm3 +; SSE-NEXT: paddb %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pcmpgtb %xmm15, %xmm4 ; SSE-NEXT: pand %xmm4, %xmm15 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pcmpgtb %xmm14, %xmm3 +; SSE-NEXT: por %xmm15, %xmm4 ; SSE-NEXT: pand %xmm3, %xmm14 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pcmpgtb %xmm13, %xmm2 +; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm13 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm13, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pcmpgtb %xmm12, %xmm1 +; SSE-NEXT: por %xmm13, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm12 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por %xmm12, %xmm1 @@ -2920,19 +2920,19 @@ ; SSE-NEXT: pcmpgtb %xmm11, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pcmpgtb %xmm10, %xmm1 +; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pcmpgtb %xmm9, %xmm2 +; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pcmpgtb %xmm8, %xmm3 +; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: por %xmm8, %xmm3 @@ -2940,19 +2940,19 @@ ; ; AVX2-LABEL: reassociate_smin_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminsb %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpminsb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminsb %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpminsb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminsb %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpminsb %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpminsb %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpminsb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smin_v64i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminsb %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminsb %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpminsb %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <64 x i8> %x0, %x1 @@ -2982,19 +2982,19 @@ ; ; AVX2-LABEL: reassociate_smin_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; 
AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminsw %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpminsw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminsw %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpminsw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminsw %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpminsw %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpminsw %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpminsw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smin_v32i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminsw %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminsw %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpminsw %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <32 x i16> %x0, %x1 @@ -3008,35 +3008,35 @@ define <16 x i32> @reassociate_smin_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, <16 x i32> %x3) { ; SSE-LABEL: reassociate_smin_v16i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: paddd %xmm7, %xmm3 ; SSE-NEXT: paddd %xmm4, %xmm0 -; SSE-NEXT: paddd %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 ; SSE-NEXT: paddd %xmm6, %xmm2 -; SSE-NEXT: paddd %xmm7, %xmm3 +; SSE-NEXT: paddd %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pcmpgtd %xmm15, %xmm4 ; SSE-NEXT: pand %xmm4, %xmm15 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pcmpgtd %xmm14, %xmm3 +; SSE-NEXT: por %xmm15, %xmm4 ; SSE-NEXT: pand %xmm3, %xmm14 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pcmpgtd %xmm13, %xmm2 +; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm13 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm13, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pcmpgtd %xmm12, %xmm1 +; SSE-NEXT: por %xmm13, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm12 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por %xmm12, %xmm1 @@ -3044,19 +3044,19 @@ ; SSE-NEXT: pcmpgtd %xmm11, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: por %xmm8, %xmm3 @@ -3064,19 +3064,19 @@ ; ; AVX2-LABEL: reassociate_smin_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminsd %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminsd %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: 
vpminsd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminsd %ymm7, %ymm5, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpminsd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpminsd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpminsd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smin_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminsd %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminsd %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpminsd %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -3090,112 +3090,113 @@ define <8 x i64> @reassociate_smin_v8i64(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, <8 x i64> %x3) { ; SSE-LABEL: reassociate_smin_v8i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm7, %xmm3 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: paddq %xmm7, %xmm3 +; SSE-NEXT: paddq %xmm6, %xmm2 +; SSE-NEXT: paddq %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm5 ; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] ; SSE-NEXT: pand %xmm5, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 ; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pxor %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pcmpgtd %xmm3, %xmm7 ; SSE-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE-NEXT: pcmpgtd %xmm3, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm6, %xmm8 ; SSE-NEXT: pand %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm8 ; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pxor %xmm4, %xmm5 ; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 ; SSE-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE-NEXT: pcmpgtd %xmm2, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] ; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm1 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, 
%xmm5 ; SSE-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] +; SSE-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: pand %xmm5, %xmm7 ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE-NEXT: pcmpeqd %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE-NEXT: pcmpeqd %xmm2, %xmm3 @@ -3204,13 +3205,12 @@ ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm4 +; SSE-NEXT: pxor %xmm7, %xmm4 +; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: pcmpgtd %xmm4, %xmm6 ; SSE-NEXT: pcmpeqd %xmm3, %xmm4 @@ -3219,30 +3219,30 @@ ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_smin_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm5, %ymm1, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm5, %ymm1, %ymm3 ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpgtq %ymm6, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm7, %ymm1, %ymm3 ; 
AVX2-NEXT: vblendvpd %ymm2, %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm7, %ymm1, %ymm2 -; AVX2-NEXT: vblendvpd %ymm2, %ymm7, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm3, %ymm7, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smin_v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminsq %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminsq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = add <8 x i64> %x0, %x1 diff --git a/llvm/test/CodeGen/X86/machine-combiner-int.ll b/llvm/test/CodeGen/X86/machine-combiner-int.ll --- a/llvm/test/CodeGen/X86/machine-combiner-int.ll +++ b/llvm/test/CodeGen/X86/machine-combiner-int.ll @@ -10,10 +10,10 @@ define i16 @reassociate_muls_i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3) { ; CHECK-LABEL: reassociate_muls_i16: ; CHECK: # %bb.0: +; CHECK-NEXT: imull %ecx, %edx ; CHECK-NEXT: # kill ; CHECK-NEXT: # kill ; CHECK-NEXT: leal (%rdi,%rsi), %eax -; CHECK-NEXT: imull %ecx, %edx ; CHECK-NEXT: imull %edx, %eax ; CHECK-NEXT: # kill ; CHECK-NEXT: retq @@ -26,10 +26,10 @@ define i32 @reassociate_muls_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) { ; CHECK-LABEL: reassociate_muls_i32: ; CHECK: # %bb.0: +; CHECK-NEXT: imull %ecx, %edx ; CHECK-NEXT: # kill ; CHECK-NEXT: # kill ; CHECK-NEXT: leal (%rdi,%rsi), %eax -; CHECK-NEXT: imull %ecx, %edx ; CHECK-NEXT: imull %edx, %eax ; CHECK-NEXT: retq @@ -46,8 +46,8 @@ define i64 @reassociate_muls_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) { ; CHECK-LABEL: reassociate_muls_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: leaq (%rdi,%rsi), %rax ; CHECK-NEXT: imulq %rcx, %rdx +; CHECK-NEXT: leaq (%rdi,%rsi), %rax ; CHECK-NEXT: imulq %rdx, %rax ; CHECK-NEXT: retq %t0 = add i64 %x0, %x1 diff --git a/llvm/test/CodeGen/X86/machine-combiner.ll b/llvm/test/CodeGen/X86/machine-combiner.ll --- a/llvm/test/CodeGen/X86/machine-combiner.ll +++ b/llvm/test/CodeGen/X86/machine-combiner.ll @@ -23,8 +23,8 @@ ; AVX-LABEL: reassociate_adds1: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz float %x0, %x1 %t1 = fadd reassoc nsz float %t0, %x2 @@ -43,8 +43,8 @@ ; AVX-LABEL: reassociate_adds2: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz float %x0, %x1 %t1 = fadd reassoc nsz float %x2, %t0 @@ -63,8 +63,8 @@ ; AVX-LABEL: reassociate_adds3: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz float %x0, %x1 %t1 = fadd reassoc nsz float %t0, %x2 @@ -83,8 +83,8 @@ ; AVX-LABEL: reassociate_adds4: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz float %x0, %x1 %t1 = fadd reassoc nsz float %x2, %t0 @@ -98,10 +98,10 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) { ; SSE-LABEL: 
reassociate_adds5: ; SSE: # %bb.0: +; SSE-NEXT: addss %xmm5, %xmm4 ; SSE-NEXT: addss %xmm1, %xmm0 ; SSE-NEXT: addss %xmm3, %xmm2 ; SSE-NEXT: addss %xmm2, %xmm0 -; SSE-NEXT: addss %xmm5, %xmm4 ; SSE-NEXT: addss %xmm6, %xmm4 ; SSE-NEXT: addss %xmm4, %xmm0 ; SSE-NEXT: addss %xmm7, %xmm0 @@ -279,16 +279,16 @@ define <4 x float> @reassociate_muls_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; SSE-LABEL: reassociate_muls_v4f32: ; SSE: # %bb.0: -; SSE-NEXT: addps %xmm1, %xmm0 ; SSE-NEXT: mulps %xmm3, %xmm2 +; SSE-NEXT: addps %xmm1, %xmm0 ; SSE-NEXT: mulps %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_muls_v4f32: ; AVX: # %bb.0: +; AVX-NEXT: vmulps %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmulps %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulps %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz <4 x float> %x0, %x1 %t1 = fmul reassoc nsz <4 x float> %x2, %t0 @@ -301,16 +301,16 @@ define <2 x double> @reassociate_muls_v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, <2 x double> %x3) { ; SSE-LABEL: reassociate_muls_v2f64: ; SSE: # %bb.0: -; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: mulpd %xmm3, %xmm2 +; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: mulpd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_muls_v2f64: ; AVX: # %bb.0: +; AVX-NEXT: vmulpd %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmulpd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulpd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz <2 x double> %x0, %x1 %t1 = fmul reassoc nsz <2 x double> %x2, %t0 @@ -326,8 +326,8 @@ ; SSE-NEXT: mulps %xmm2, %xmm0 ; SSE-NEXT: mulps %xmm3, %xmm1 ; SSE-NEXT: addps %xmm6, %xmm4 -; SSE-NEXT: addps %xmm4, %xmm0 ; SSE-NEXT: addps %xmm7, %xmm5 +; SSE-NEXT: addps %xmm4, %xmm0 ; SSE-NEXT: addps %xmm5, %xmm1 ; SSE-NEXT: retq ; @@ -357,8 +357,8 @@ ; SSE-NEXT: mulpd %xmm2, %xmm0 ; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: addpd %xmm6, %xmm4 -; SSE-NEXT: addpd %xmm4, %xmm0 ; SSE-NEXT: addpd %xmm7, %xmm5 +; SSE-NEXT: addpd %xmm4, %xmm0 ; SSE-NEXT: addpd %xmm5, %xmm1 ; SSE-NEXT: retq ; @@ -385,19 +385,19 @@ define <8 x float> @reassociate_muls_v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, <8 x float> %x3) { ; SSE-LABEL: reassociate_muls_v8f32: ; SSE: # %bb.0: +; SSE-NEXT: mulps %xmm6, %xmm4 +; SSE-NEXT: mulps %xmm7, %xmm5 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: addps %xmm3, %xmm1 -; SSE-NEXT: mulps %xmm6, %xmm4 ; SSE-NEXT: mulps %xmm4, %xmm0 -; SSE-NEXT: mulps %xmm7, %xmm5 ; SSE-NEXT: mulps %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_muls_v8f32: ; AVX: # %bb.0: +; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz <8 x float> %x0, %x1 %t1 = fmul reassoc nsz <8 x float> %x2, %t0 @@ -410,19 +410,19 @@ define <4 x double> @reassociate_muls_v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, <4 x double> %x3) { ; SSE-LABEL: reassociate_muls_v4f64: ; SSE: # %bb.0: +; SSE-NEXT: mulpd %xmm6, %xmm4 +; SSE-NEXT: mulpd %xmm7, %xmm5 ; SSE-NEXT: addpd %xmm2, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm1 -; SSE-NEXT: mulpd %xmm6, %xmm4 ; SSE-NEXT: mulpd %xmm4, %xmm0 -; SSE-NEXT: mulpd %xmm7, %xmm5 ; SSE-NEXT: mulpd %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_muls_v4f64: ; AVX: # %bb.0: +; AVX-NEXT: vmulpd %ymm3, %ymm2, 
%ymm2 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmulpd %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmulpd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz <4 x double> %x0, %x1 %t1 = fmul reassoc nsz <4 x double> %x2, %t0 @@ -454,9 +454,9 @@ ; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vaddps %ymm6, %ymm4, %ymm2 +; AVX1-NEXT: vaddps %ymm7, %ymm5, %ymm3 ; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vaddps %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_adds_v16f32: @@ -494,9 +494,9 @@ ; AVX1-NEXT: vmulpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vaddpd %ymm6, %ymm4, %ymm2 +; AVX1-NEXT: vaddpd %ymm7, %ymm5, %ymm3 ; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vaddpd %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_adds_v8f64: @@ -531,19 +531,19 @@ ; ; AVX1-LABEL: reassociate_muls_v16f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmulps %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmulps %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vmulps %ymm7, %ymm5, %ymm3 +; AVX1-NEXT: vmulps %ymm6, %ymm4, %ymm4 +; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmulps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_muls_v16f32: ; AVX512: # %bb.0: +; AVX512-NEXT: vmulps %zmm3, %zmm2, %zmm2 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmulps %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmulps %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = fadd reassoc nsz <16 x float> %x0, %x1 %t1 = fmul reassoc nsz <16 x float> %x2, %t0 @@ -572,19 +572,19 @@ ; ; AVX1-LABEL: reassociate_muls_v8f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmulpd %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmulpd %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vmulpd %ymm7, %ymm5, %ymm3 +; AVX1-NEXT: vmulpd %ymm6, %ymm4, %ymm4 +; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_muls_v8f64: ; AVX512: # %bb.0: +; AVX512-NEXT: vmulpd %zmm3, %zmm2, %zmm2 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmulpd %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = fadd reassoc nsz <8 x double> %x0, %x1 %t1 = fmul reassoc nsz <8 x double> %x2, %t0 @@ -701,8 +701,8 @@ ; AVX-LABEL: reassociate_mins_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vminps %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminps %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vminps %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = fadd <4 x float> %x0, %x1 %cmp1 = fcmp olt <4 x float> %x2, %t0 @@ -725,8 +725,8 @@ ; AVX-LABEL: reassociate_maxs_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmaxps %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxps %xmm3, %xmm2, %xmm2 +; AVX-NEXT: 
vmaxps %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = fadd <4 x float> %x0, %x1 %cmp1 = fcmp ogt <4 x float> %x2, %t0 @@ -749,8 +749,8 @@ ; AVX-LABEL: reassociate_mins_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vminpd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminpd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = fadd <2 x double> %x0, %x1 %cmp1 = fcmp olt <2 x double> %x2, %t0 @@ -773,8 +773,8 @@ ; AVX-LABEL: reassociate_maxs_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmaxpd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxpd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vmaxpd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %t0 = fadd <2 x double> %x0, %x1 %cmp1 = fcmp ogt <2 x double> %x2, %t0 @@ -790,18 +790,18 @@ ; SSE-LABEL: reassociate_mins_v8f32: ; SSE: # %bb.0: ; SSE-NEXT: addps %xmm2, %xmm0 -; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: minps %xmm6, %xmm4 -; SSE-NEXT: minps %xmm4, %xmm0 +; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: minps %xmm7, %xmm5 +; SSE-NEXT: minps %xmm4, %xmm0 ; SSE-NEXT: minps %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_mins_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vminps %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vminps %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vminps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = fadd <8 x float> %x0, %x1 %cmp1 = fcmp olt <8 x float> %x2, %t0 @@ -817,18 +817,18 @@ ; SSE-LABEL: reassociate_maxs_v8f32: ; SSE: # %bb.0: ; SSE-NEXT: addps %xmm2, %xmm0 -; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: maxps %xmm6, %xmm4 -; SSE-NEXT: maxps %xmm4, %xmm0 +; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: maxps %xmm7, %xmm5 +; SSE-NEXT: maxps %xmm4, %xmm0 ; SSE-NEXT: maxps %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_maxs_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmaxps %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmaxps %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vmaxps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = fadd <8 x float> %x0, %x1 %cmp1 = fcmp ogt <8 x float> %x2, %t0 @@ -844,18 +844,18 @@ ; SSE-LABEL: reassociate_mins_v4f64: ; SSE: # %bb.0: ; SSE-NEXT: addpd %xmm2, %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: minpd %xmm6, %xmm4 -; SSE-NEXT: minpd %xmm4, %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: minpd %xmm7, %xmm5 +; SSE-NEXT: minpd %xmm4, %xmm0 ; SSE-NEXT: minpd %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_mins_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vminpd %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vminpd %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vminpd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = fadd <4 x double> %x0, %x1 %cmp1 = fcmp olt <4 x double> %x2, %t0 @@ -871,18 +871,18 @@ ; SSE-LABEL: reassociate_maxs_v4f64: ; SSE: # %bb.0: ; SSE-NEXT: addpd %xmm2, %xmm0 -; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: maxpd %xmm6, %xmm4 -; SSE-NEXT: maxpd %xmm4, %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: maxpd %xmm7, %xmm5 +; SSE-NEXT: maxpd %xmm4, %xmm0 ; SSE-NEXT: maxpd %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: reassociate_maxs_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmaxpd %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmaxpd %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: retq %t0 = fadd <4 x double> %x0, %x1 %cmp1 = fcmp 
ogt <4 x double> %x2, %t0 @@ -913,19 +913,19 @@ ; ; AVX1-LABEL: reassociate_mins_v16f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vminps %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vminps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vminps %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vminps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vminps %ymm7, %ymm5, %ymm3 +; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vminps %ymm6, %ymm4, %ymm4 +; AVX1-NEXT: vminps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vminps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_mins_v16f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vminps %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vminps %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vminps %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = fadd <16 x float> %x0, %x1 %cmp1 = fcmp olt <16 x float> %x2, %t0 @@ -956,19 +956,19 @@ ; ; AVX1-LABEL: reassociate_maxs_v16f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmaxps %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vmaxps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmaxps %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vmaxps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vmaxps %ymm7, %ymm5, %ymm3 +; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmaxps %ymm6, %ymm4, %ymm4 +; AVX1-NEXT: vmaxps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vmaxps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_maxs_v16f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmaxps %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmaxps %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vmaxps %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = fadd <16 x float> %x0, %x1 %cmp1 = fcmp ogt <16 x float> %x2, %t0 @@ -999,19 +999,19 @@ ; ; AVX1-LABEL: reassociate_mins_v8f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vminpd %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vminpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vminpd %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vminpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vminpd %ymm7, %ymm5, %ymm3 +; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vminpd %ymm6, %ymm4, %ymm4 +; AVX1-NEXT: vminpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vminpd %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_mins_v8f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vminpd %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vminpd %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vminpd %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = fadd <8 x double> %x0, %x1 %cmp1 = fcmp olt <8 x double> %x2, %t0 @@ -1042,19 +1042,19 @@ ; ; AVX1-LABEL: reassociate_maxs_v8f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmaxpd %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vmaxpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmaxpd %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vmaxpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vmaxpd %ymm7, %ymm5, %ymm3 +; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmaxpd %ymm6, %ymm4, %ymm4 +; AVX1-NEXT: vmaxpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vmaxpd %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_maxs_v8f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmaxpd %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmaxpd %zmm3, %zmm2, %zmm2 
+; AVX512-NEXT: vmaxpd %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq %t0 = fadd <8 x double> %x0, %x1 %cmp1 = fcmp ogt <8 x double> %x2, %t0 @@ -1083,32 +1083,52 @@ ; SSE-NEXT: callq bar ; SSE-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; SSE-NEXT: # xmm1 = mem[0],zero -; SSE-NEXT: addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload ; SSE-NEXT: addsd (%rsp), %xmm0 # 8-byte Folded Reload +; SSE-NEXT: addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload ; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq ; -; AVX-LABEL: reassociate_adds_from_calls: -; AVX: # %bb.0: -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: callq bar -; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX-NEXT: callq bar -; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX-NEXT: callq bar -; AVX-NEXT: vmovsd %xmm0, (%rsp) # 8-byte Spill -; AVX-NEXT: callq bar -; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload -; AVX-NEXT: # xmm1 = mem[0],zero -; AVX-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 8-byte Folded Reload -; AVX-NEXT: vaddsd (%rsp), %xmm0, %xmm0 # 8-byte Folded Reload -; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: reassociate_adds_from_calls: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: callq bar +; AVX1-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: callq bar +; AVX1-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: callq bar +; AVX1-NEXT: vmovsd %xmm0, (%rsp) # 8-byte Spill +; AVX1-NEXT: callq bar +; AVX1-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; AVX1-NEXT: # xmm1 = mem[0],zero +; AVX1-NEXT: vaddsd (%rsp), %xmm0, %xmm0 # 8-byte Folded Reload +; AVX1-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 8-byte Folded Reload +; AVX1-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: reassociate_adds_from_calls: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: callq bar +; AVX512-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: callq bar +; AVX512-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: callq bar +; AVX512-NEXT: vmovsd %xmm0, (%rsp) # 8-byte Spill +; AVX512-NEXT: callq bar +; AVX512-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; AVX512-NEXT: # xmm1 = mem[0],zero +; AVX512-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 8-byte Folded Reload +; AVX512-NEXT: vaddsd (%rsp), %xmm0, %xmm0 # 8-byte Folded Reload +; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq %x0 = call double @bar() %x1 = call double @bar() @@ -1134,32 +1154,52 @@ ; SSE-NEXT: callq bar ; SSE-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; SSE-NEXT: # xmm1 = mem[0],zero -; SSE-NEXT: addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload ; SSE-NEXT: addsd (%rsp), %xmm0 # 8-byte Folded Reload +; SSE-NEXT: addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload ; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq ; -; AVX-LABEL: already_reassociated: -; AVX: # %bb.0: -; AVX-NEXT: 
subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: callq bar -; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX-NEXT: callq bar -; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX-NEXT: callq bar -; AVX-NEXT: vmovsd %xmm0, (%rsp) # 8-byte Spill -; AVX-NEXT: callq bar -; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload -; AVX-NEXT: # xmm1 = mem[0],zero -; AVX-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 8-byte Folded Reload -; AVX-NEXT: vaddsd (%rsp), %xmm0, %xmm0 # 8-byte Folded Reload -; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: already_reassociated: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: callq bar +; AVX1-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: callq bar +; AVX1-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: callq bar +; AVX1-NEXT: vmovsd %xmm0, (%rsp) # 8-byte Spill +; AVX1-NEXT: callq bar +; AVX1-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; AVX1-NEXT: # xmm1 = mem[0],zero +; AVX1-NEXT: vaddsd (%rsp), %xmm0, %xmm0 # 8-byte Folded Reload +; AVX1-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 8-byte Folded Reload +; AVX1-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: already_reassociated: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: callq bar +; AVX512-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: callq bar +; AVX512-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: callq bar +; AVX512-NEXT: vmovsd %xmm0, (%rsp) # 8-byte Spill +; AVX512-NEXT: callq bar +; AVX512-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; AVX512-NEXT: # xmm1 = mem[0],zero +; AVX512-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 8-byte Folded Reload +; AVX512-NEXT: vaddsd (%rsp), %xmm0, %xmm0 # 8-byte Folded Reload +; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq %x0 = call double @bar() %x1 = call double @bar() diff --git a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll --- a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll +++ b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll @@ -29,8 +29,8 @@ ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload ; CHECK-NEXT: # xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: mulss %xmm0, %xmm2 ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: mulss %xmm0, %xmm2 ; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: addss %xmm2, %xmm0 ; CHECK-NEXT: movss %xmm0, (%rax) diff --git a/llvm/test/CodeGen/X86/memcpy-2.ll b/llvm/test/CodeGen/X86/memcpy-2.ll --- a/llvm/test/CodeGen/X86/memcpy-2.ll +++ b/llvm/test/CodeGen/X86/memcpy-2.ll @@ -13,24 +13,24 @@ entry: ; SSE2-Darwin-LABEL: t1: ; SSE2-Darwin: movsd _.str+16, %xmm0 -; SSE2-Darwin: movsd %xmm0, 16(%esp) -; SSE2-Darwin: movaps _.str, %xmm0 -; SSE2-Darwin: movaps %xmm0 +; SSE2-Darwin: movaps _.str, %xmm1 ; SSE2-Darwin: movb $0, 24(%esp) +; SSE2-Darwin: movsd %xmm0, 16(%esp) +; SSE2-Darwin: movaps %xmm1 ; SSE2-Mingw32-LABEL: t1: ; SSE2-Mingw32: movsd _.str+16, %xmm0 
-; SSE2-Mingw32: movsd %xmm0, 16(%esp) -; SSE2-Mingw32: movaps _.str, %xmm0 -; SSE2-Mingw32: movups %xmm0 +; SSE2-Mingw32: movaps _.str, %xmm1 ; SSE2-Mingw32: movb $0, 24(%esp) +; SSE2-Mingw32: movsd %xmm0, 16(%esp) +; SSE2-Mingw32: movups %xmm1 ; SSE1-LABEL: t1: ; SSE1: movaps _.str, %xmm0 -; SSE1: movaps %xmm0 ; SSE1: movb $0, 24(%esp) ; SSE1: movl $0, 20(%esp) ; SSE1: movl $0, 16(%esp) +; SSE1: movaps %xmm0 ; NOSSE-LABEL: t1: ; NOSSE: movb $0 @@ -43,9 +43,9 @@ ; X86-64-LABEL: t1: ; X86-64: movaps _.str(%rip), %xmm0 -; X86-64: movaps %xmm0 ; X86-64: movb $0 ; X86-64: movq $0 +; X86-64: movaps %xmm0 %tmp1 = alloca [25 x i8] %tmp2 = bitcast [25 x i8]* %tmp1 to i8* call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %tmp2, i8* align 1 getelementptr inbounds ([25 x i8], [25 x i8]* @.str, i32 0, i32 0), i32 25, i1 false) @@ -187,18 +187,18 @@ ;;; Is either of the sequences ideal? ; X86-64-LABEL: t4: +; X86-64: movabsq $8680820740569200760, %rcx ## imm = 0x7878787878787878 ; X86-64: movabsq $33909456017848440, %rax ## imm = 0x78787878787878 ; X86-64: movq %rax, -10(%rsp) -; X86-64: movabsq $8680820740569200760, %rax ## imm = 0x7878787878787878 -; X86-64: movq %rax, -16(%rsp) -; X86-64: movq %rax, -24(%rsp) -; X86-64: movq %rax, -32(%rsp) +; X86-64: movq %rcx, -16(%rsp) +; X86-64: movq %rcx, -24(%rsp) +; X86-64: movq %rcx, -32(%rsp) ; NHM_64-LABEL: t4: ; NHM_64: movups _.str2+14(%rip), %xmm0 +; NHM_64: movups _.str2(%rip), %xmm1 ; NHM_64: movups %xmm0, -26(%rsp) -; NHM_64: movups _.str2(%rip), %xmm0 -; NHM_64: movaps %xmm0, -40(%rsp) +; NHM_64: movaps %xmm1, -40(%rsp) %tmp1 = alloca [30 x i8] %tmp2 = bitcast [30 x i8]* %tmp1 to i8* diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll --- a/llvm/test/CodeGen/X86/memcpy.ll +++ b/llvm/test/CodeGen/X86/memcpy.ll @@ -140,11 +140,6 @@ } define void @test3_pgso(i8* nocapture %A, i8* nocapture %B) nounwind noredzone !prof !14 { -; LINUX-LABEL: test3_pgso: -; LINUX: # %bb.0: # %entry -; LINUX-NEXT: movl $64, %edx -; LINUX-NEXT: jmp memcpy # TAILCALL -; ; DARWIN-LABEL: test3_pgso: ; DARWIN: ## %bb.0: ## %entry ; DARWIN-NEXT: movq 56(%rsi), %rax @@ -164,6 +159,42 @@ ; DARWIN-NEXT: movq %rcx, 8(%rdi) ; DARWIN-NEXT: movq %rax, (%rdi) ; DARWIN-NEXT: retq +; +; LINUX-LABEL: test3_pgso: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: movl $64, %edx +; LINUX-NEXT: jmp memcpy # TAILCALL +; +; LINUX-SKL-LABEL: test3_pgso: +; LINUX-SKL: # %bb.0: # %entry +; LINUX-SKL-NEXT: vmovups (%rsi), %ymm0 +; LINUX-SKL-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-SKL-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-SKL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKL-NEXT: vzeroupper +; LINUX-SKL-NEXT: retq +; +; LINUX-SKX-LABEL: test3_pgso: +; LINUX-SKX: # %bb.0: # %entry +; LINUX-SKX-NEXT: vmovups (%rsi), %ymm0 +; LINUX-SKX-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-SKX-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-SKX-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKX-NEXT: vzeroupper +; LINUX-SKX-NEXT: retq +; +; LINUX-KNL-LABEL: test3_pgso: +; LINUX-KNL: # %bb.0: # %entry +; LINUX-KNL-NEXT: vmovups (%rsi), %zmm0 +; LINUX-KNL-NEXT: vmovups %zmm0, (%rdi) +; LINUX-KNL-NEXT: retq +; +; LINUX-AVX512BW-LABEL: test3_pgso: +; LINUX-AVX512BW: # %bb.0: # %entry +; LINUX-AVX512BW-NEXT: vmovups (%rsi), %zmm0 +; LINUX-AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; LINUX-AVX512BW-NEXT: vzeroupper +; LINUX-AVX512BW-NEXT: retq entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false) ret void @@ -347,17 +378,17 @@ ; DARWIN-LABEL: test5: ; DARWIN: ## %bb.0: ## %entry ; DARWIN-NEXT: movabsq 
$7016996765293437281, %rax ## imm = 0x6161616161616161 +; DARWIN-NEXT: movabsq $7016996765293437184, %rcx ## imm = 0x6161616161616100 ; DARWIN-NEXT: movq %rax, 8(%rdi) -; DARWIN-NEXT: movabsq $7016996765293437184, %rax ## imm = 0x6161616161616100 -; DARWIN-NEXT: movq %rax, (%rdi) +; DARWIN-NEXT: movq %rcx, (%rdi) ; DARWIN-NEXT: retq ; ; LINUX-LABEL: test5: ; LINUX: # %bb.0: # %entry ; LINUX-NEXT: movabsq $7016996765293437281, %rax # imm = 0x6161616161616161 +; LINUX-NEXT: movabsq $7016996765293437184, %rcx # imm = 0x6161616161616100 ; LINUX-NEXT: movq %rax, 8(%rdi) -; LINUX-NEXT: movabsq $7016996765293437184, %rax # imm = 0x6161616161616100 -; LINUX-NEXT: movq %rax, (%rdi) +; LINUX-NEXT: movq %rcx, (%rdi) ; LINUX-NEXT: retq ; ; LINUX-SKL-LABEL: test5: diff --git a/llvm/test/CodeGen/X86/memset-2.ll b/llvm/test/CodeGen/X86/memset-2.ll --- a/llvm/test/CodeGen/X86/memset-2.ll +++ b/llvm/test/CodeGen/X86/memset-2.ll @@ -34,8 +34,8 @@ define void @t3(i8* nocapture %s, i8 %a) nounwind { ; CHECK-LABEL: t3: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: imull $16843009, %ecx, %ecx ## imm = 0x1010101 ; CHECK-NEXT: movl %ecx, 4(%eax) ; CHECK-NEXT: movl %ecx, (%eax) @@ -48,8 +48,8 @@ define void @t4(i8* nocapture %s, i8 %a) nounwind { ; CHECK-LABEL: t4: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: imull $16843009, %ecx, %ecx ## imm = 0x1010101 ; CHECK-NEXT: movl %ecx, 11(%eax) ; CHECK-NEXT: movl %ecx, 8(%eax) diff --git a/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll b/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll --- a/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll +++ b/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll @@ -33,51 +33,97 @@ ; NOSSE-NEXT: popl %ebp ; NOSSE-NEXT: retl ; -; SSE-LABEL: test1: -; SSE: # %bb.0: -; SSE-NEXT: pushl %ebp -; SSE-NEXT: movl %esp, %ebp -; SSE-NEXT: pushl %esi -; SSE-NEXT: andl $-16, %esp -; SSE-NEXT: subl $48, %esp -; SSE-NEXT: movl %esp, %esi -; SSE-NEXT: movl 8(%ebp), %eax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, 16(%esi) -; SSE-NEXT: movaps %xmm0, (%esi) -; SSE-NEXT: addl $3, %eax -; SSE-NEXT: andl $-4, %eax -; SSE-NEXT: calll __alloca -; SSE-NEXT: movl %esp, %eax -; SSE-NEXT: pushl %eax -; SSE-NEXT: calll _dummy -; SSE-NEXT: leal -4(%ebp), %esp -; SSE-NEXT: popl %esi -; SSE-NEXT: popl %ebp -; SSE-NEXT: retl +; SSE1-LABEL: test1: +; SSE1: # %bb.0: +; SSE1-NEXT: pushl %ebp +; SSE1-NEXT: movl %esp, %ebp +; SSE1-NEXT: pushl %esi +; SSE1-NEXT: andl $-16, %esp +; SSE1-NEXT: subl $48, %esp +; SSE1-NEXT: movl %esp, %esi +; SSE1-NEXT: movl 8(%ebp), %eax +; SSE1-NEXT: xorps %xmm0, %xmm0 +; SSE1-NEXT: movaps %xmm0, 16(%esi) +; SSE1-NEXT: movaps %xmm0, (%esi) +; SSE1-NEXT: addl $3, %eax +; SSE1-NEXT: andl $-4, %eax +; SSE1-NEXT: calll __alloca +; SSE1-NEXT: movl %esp, %eax +; SSE1-NEXT: pushl %eax +; SSE1-NEXT: calll _dummy +; SSE1-NEXT: leal -4(%ebp), %esp +; SSE1-NEXT: popl %esi +; SSE1-NEXT: popl %ebp +; SSE1-NEXT: retl ; -; AVX-LABEL: test1: -; AVX: # %bb.0: -; AVX-NEXT: pushl %ebp -; AVX-NEXT: movl %esp, %ebp -; AVX-NEXT: pushl %esi -; AVX-NEXT: andl $-32, %esp -; AVX-NEXT: subl $64, %esp -; AVX-NEXT: movl %esp, %esi -; AVX-NEXT: movl 8(%ebp), %eax -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %ymm0, (%esi) -; AVX-NEXT: addl $3, %eax -; AVX-NEXT: andl $-4, 
%eax -; AVX-NEXT: calll __alloca -; AVX-NEXT: movl %esp, %eax -; AVX-NEXT: pushl %eax -; AVX-NEXT: vzeroupper -; AVX-NEXT: calll _dummy -; AVX-NEXT: leal -4(%ebp), %esp -; AVX-NEXT: popl %esi -; AVX-NEXT: popl %ebp -; AVX-NEXT: retl +; SSE2-LABEL: test1: +; SSE2: # %bb.0: +; SSE2-NEXT: pushl %ebp +; SSE2-NEXT: movl %esp, %ebp +; SSE2-NEXT: pushl %esi +; SSE2-NEXT: andl $-16, %esp +; SSE2-NEXT: subl $48, %esp +; SSE2-NEXT: movl 8(%ebp), %eax +; SSE2-NEXT: movl %esp, %esi +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, 16(%esi) +; SSE2-NEXT: movaps %xmm0, (%esi) +; SSE2-NEXT: addl $3, %eax +; SSE2-NEXT: andl $-4, %eax +; SSE2-NEXT: calll __alloca +; SSE2-NEXT: movl %esp, %eax +; SSE2-NEXT: pushl %eax +; SSE2-NEXT: calll _dummy +; SSE2-NEXT: leal -4(%ebp), %esp +; SSE2-NEXT: popl %esi +; SSE2-NEXT: popl %ebp +; SSE2-NEXT: retl +; +; AVX1-LABEL: test1: +; AVX1: # %bb.0: +; AVX1-NEXT: pushl %ebp +; AVX1-NEXT: movl %esp, %ebp +; AVX1-NEXT: pushl %esi +; AVX1-NEXT: andl $-32, %esp +; AVX1-NEXT: subl $64, %esp +; AVX1-NEXT: movl 8(%ebp), %eax +; AVX1-NEXT: movl %esp, %esi +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps %ymm0, (%esi) +; AVX1-NEXT: addl $3, %eax +; AVX1-NEXT: andl $-4, %eax +; AVX1-NEXT: calll __alloca +; AVX1-NEXT: movl %esp, %eax +; AVX1-NEXT: pushl %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: calll _dummy +; AVX1-NEXT: leal -4(%ebp), %esp +; AVX1-NEXT: popl %esi +; AVX1-NEXT: popl %ebp +; AVX1-NEXT: retl +; +; AVX2-LABEL: test1: +; AVX2: # %bb.0: +; AVX2-NEXT: pushl %ebp +; AVX2-NEXT: movl %esp, %ebp +; AVX2-NEXT: pushl %esi +; AVX2-NEXT: andl $-32, %esp +; AVX2-NEXT: subl $64, %esp +; AVX2-NEXT: movl %esp, %esi +; AVX2-NEXT: movl 8(%ebp), %eax +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %ymm0, (%esi) +; AVX2-NEXT: addl $3, %eax +; AVX2-NEXT: andl $-4, %eax +; AVX2-NEXT: calll __alloca +; AVX2-NEXT: movl %esp, %eax +; AVX2-NEXT: pushl %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: calll _dummy +; AVX2-NEXT: leal -4(%ebp), %esp +; AVX2-NEXT: popl %esi +; AVX2-NEXT: popl %ebp +; AVX2-NEXT: retl %tmp1210 = alloca i8, i32 32, align 4 call void @llvm.memset.p0i8.i64(i8* align 4 %tmp1210, i8 0, i64 32, i1 false) %x = alloca i8, i32 %t @@ -106,49 +152,93 @@ ; NOSSE-NEXT: popl %ebp ; NOSSE-NEXT: retl ; -; SSE-LABEL: test2: -; SSE: # %bb.0: -; SSE-NEXT: pushl %ebp -; SSE-NEXT: movl %esp, %ebp -; SSE-NEXT: pushl %esi -; SSE-NEXT: andl $-16, %esp -; SSE-NEXT: subl $32, %esp -; SSE-NEXT: movl %esp, %esi -; SSE-NEXT: movl 8(%ebp), %eax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, (%esi) -; SSE-NEXT: addl $3, %eax -; SSE-NEXT: andl $-4, %eax -; SSE-NEXT: calll __alloca -; SSE-NEXT: movl %esp, %eax -; SSE-NEXT: pushl %eax -; SSE-NEXT: calll _dummy -; SSE-NEXT: leal -4(%ebp), %esp -; SSE-NEXT: popl %esi -; SSE-NEXT: popl %ebp -; SSE-NEXT: retl +; SSE1-LABEL: test2: +; SSE1: # %bb.0: +; SSE1-NEXT: pushl %ebp +; SSE1-NEXT: movl %esp, %ebp +; SSE1-NEXT: pushl %esi +; SSE1-NEXT: andl $-16, %esp +; SSE1-NEXT: subl $32, %esp +; SSE1-NEXT: movl %esp, %esi +; SSE1-NEXT: movl 8(%ebp), %eax +; SSE1-NEXT: xorps %xmm0, %xmm0 +; SSE1-NEXT: movaps %xmm0, (%esi) +; SSE1-NEXT: addl $3, %eax +; SSE1-NEXT: andl $-4, %eax +; SSE1-NEXT: calll __alloca +; SSE1-NEXT: movl %esp, %eax +; SSE1-NEXT: pushl %eax +; SSE1-NEXT: calll _dummy +; SSE1-NEXT: leal -4(%ebp), %esp +; SSE1-NEXT: popl %esi +; SSE1-NEXT: popl %ebp +; SSE1-NEXT: retl +; +; SSE2-LABEL: test2: +; SSE2: # %bb.0: +; SSE2-NEXT: pushl %ebp +; SSE2-NEXT: movl %esp, %ebp +; SSE2-NEXT: pushl %esi +; 
SSE2-NEXT: andl $-16, %esp +; SSE2-NEXT: subl $32, %esp +; SSE2-NEXT: movl 8(%ebp), %eax +; SSE2-NEXT: movl %esp, %esi +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, (%esi) +; SSE2-NEXT: addl $3, %eax +; SSE2-NEXT: andl $-4, %eax +; SSE2-NEXT: calll __alloca +; SSE2-NEXT: movl %esp, %eax +; SSE2-NEXT: pushl %eax +; SSE2-NEXT: calll _dummy +; SSE2-NEXT: leal -4(%ebp), %esp +; SSE2-NEXT: popl %esi +; SSE2-NEXT: popl %ebp +; SSE2-NEXT: retl +; +; AVX1-LABEL: test2: +; AVX1: # %bb.0: +; AVX1-NEXT: pushl %ebp +; AVX1-NEXT: movl %esp, %ebp +; AVX1-NEXT: pushl %esi +; AVX1-NEXT: andl $-16, %esp +; AVX1-NEXT: subl $32, %esp +; AVX1-NEXT: movl 8(%ebp), %eax +; AVX1-NEXT: movl %esp, %esi +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, (%esi) +; AVX1-NEXT: addl $3, %eax +; AVX1-NEXT: andl $-4, %eax +; AVX1-NEXT: calll __alloca +; AVX1-NEXT: movl %esp, %eax +; AVX1-NEXT: pushl %eax +; AVX1-NEXT: calll _dummy +; AVX1-NEXT: leal -4(%ebp), %esp +; AVX1-NEXT: popl %esi +; AVX1-NEXT: popl %ebp +; AVX1-NEXT: retl ; -; AVX-LABEL: test2: -; AVX: # %bb.0: -; AVX-NEXT: pushl %ebp -; AVX-NEXT: movl %esp, %ebp -; AVX-NEXT: pushl %esi -; AVX-NEXT: andl $-16, %esp -; AVX-NEXT: subl $32, %esp -; AVX-NEXT: movl %esp, %esi -; AVX-NEXT: movl 8(%ebp), %eax -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, (%esi) -; AVX-NEXT: addl $3, %eax -; AVX-NEXT: andl $-4, %eax -; AVX-NEXT: calll __alloca -; AVX-NEXT: movl %esp, %eax -; AVX-NEXT: pushl %eax -; AVX-NEXT: calll _dummy -; AVX-NEXT: leal -4(%ebp), %esp -; AVX-NEXT: popl %esi -; AVX-NEXT: popl %ebp -; AVX-NEXT: retl +; AVX2-LABEL: test2: +; AVX2: # %bb.0: +; AVX2-NEXT: pushl %ebp +; AVX2-NEXT: movl %esp, %ebp +; AVX2-NEXT: pushl %esi +; AVX2-NEXT: andl $-16, %esp +; AVX2-NEXT: subl $32, %esp +; AVX2-NEXT: movl %esp, %esi +; AVX2-NEXT: movl 8(%ebp), %eax +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %xmm0, (%esi) +; AVX2-NEXT: addl $3, %eax +; AVX2-NEXT: andl $-4, %eax +; AVX2-NEXT: calll __alloca +; AVX2-NEXT: movl %esp, %eax +; AVX2-NEXT: pushl %eax +; AVX2-NEXT: calll _dummy +; AVX2-NEXT: leal -4(%ebp), %esp +; AVX2-NEXT: popl %esi +; AVX2-NEXT: popl %ebp +; AVX2-NEXT: retl %tmp1210 = alloca i8, i32 16, align 4 call void @llvm.memset.p0i8.i64(i8* align 4 %tmp1210, i8 0, i64 16, i1 false) %x = alloca i8, i32 %t diff --git a/llvm/test/CodeGen/X86/misched-aa-colored.ll b/llvm/test/CodeGen/X86/misched-aa-colored.ll --- a/llvm/test/CodeGen/X86/misched-aa-colored.ll +++ b/llvm/test/CodeGen/X86/misched-aa-colored.ll @@ -168,8 +168,8 @@ ; CHECK: movl (%rax), %eax ; CHECK-NOT: movl %eax, {{[0-9]+}}(%rsp) ; CHECK: movl [[OFF:[0-9]+]](%rsp), %r8d -; CHECK: movl %eax, [[OFF]](%rsp) ; CHECK: movl $-1, %ecx +; CHECK: movl %eax, [[OFF]](%rsp) ; CHECK: callq _ZN4llvm12SelectionDAG7getNodeEjNS_5SDLocENS_3EVTENS_7SDValueES3_ %call18 = call { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 } @_ZN4llvm12SelectionDAG7getNodeEjNS_5SDLocENS_3EVTENS_7SDValueES3_(%"class.llvm::SelectionDAG.104.704.1064.2024.2144.2384.4184"* undef, i32 undef, i8* undef, i32 -1, i32 %retval.sroa.0.0.copyload.i37, %"class.llvm::Type.7.607.967.1927.2047.2287.4087"* undef, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* byval align 8 undef, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* byval align 8 undef) #1 diff --git a/llvm/test/CodeGen/X86/misched-matrix.ll b/llvm/test/CodeGen/X86/misched-matrix.ll --- a/llvm/test/CodeGen/X86/misched-matrix.ll +++ b/llvm/test/CodeGen/X86/misched-matrix.ll @@ -2,10 +2,10 @@ ; RUN: 
-misched-topdown -verify-machineinstrs \ ; RUN: | FileCheck %s -check-prefix=TOPDOWN ; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 -pre-RA-sched=source -enable-misched \ -; RUN: -misched=ilpmin -verify-machineinstrs \ +; RUN: -misched=ilpmin -verify-machineinstrs -disable-post-ra \ ; RUN: | FileCheck %s -check-prefix=ILPMIN ; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 -pre-RA-sched=source -enable-misched \ -; RUN: -misched=ilpmax -verify-machineinstrs \ +; RUN: -misched=ilpmax -verify-machineinstrs -disable-post-ra \ ; RUN: | FileCheck %s -check-prefix=ILPMAX ; ; Verify that the MI scheduler minimizes register pressure for a diff --git a/llvm/test/CodeGen/X86/misched-new.ll b/llvm/test/CodeGen/X86/misched-new.ll --- a/llvm/test/CodeGen/X86/misched-new.ll +++ b/llvm/test/CodeGen/X86/misched-new.ll @@ -16,8 +16,8 @@ ; From oggenc. ; After coalescing, we have a dead superreg (RAX) definition. ; -; CHECK: xorl %esi, %esi ; CHECK: movl $32, %ecx +; CHECK: xorl %esi, %esi ; CHECK: rep;movsl define fastcc void @_preextrapolate_helper() nounwind uwtable ssp { entry: diff --git a/llvm/test/CodeGen/X86/movgs.ll b/llvm/test/CodeGen/X86/movgs.ll --- a/llvm/test/CodeGen/X86/movgs.ll +++ b/llvm/test/CodeGen/X86/movgs.ll @@ -71,16 +71,16 @@ ; X32-LABEL: test_no_cse: ; X32: # %bb.0: # %entry ; X32-NEXT: movl %gs:196, %eax -; X32-NEXT: movl (%eax), %eax ; X32-NEXT: movl %fs:196, %ecx +; X32-NEXT: movl (%eax), %eax ; X32-NEXT: addl (%ecx), %eax ; X32-NEXT: retl ; ; X64-LABEL: test_no_cse: ; X64: # %bb.0: # %entry ; X64-NEXT: movq %gs:320, %rax -; X64-NEXT: movl (%rax), %eax ; X64-NEXT: movq %fs:320, %rcx +; X64-NEXT: movl (%rax), %eax ; X64-NEXT: addl (%rcx), %eax ; X64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/patchpoint-webkit_jscc.ll b/llvm/test/CodeGen/X86/patchpoint-webkit_jscc.ll --- a/llvm/test/CodeGen/X86/patchpoint-webkit_jscc.ll +++ b/llvm/test/CodeGen/X86/patchpoint-webkit_jscc.ll @@ -7,16 +7,16 @@ define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { entry: ; CHECK-LABEL: jscall_patchpoint_codegen: -; CHECK: movq %r{{.+}}, (%rsp) ; CHECK: movq %r{{.+}}, %rax +; CHECK: movq %r{{.+}}, (%rsp) ; CHECK: Ltmp ; CHECK-NEXT: movabsq $-559038736, %r11 ; CHECK-NEXT: callq *%r11 ; CHECK: movq %rax, (%rsp) ; CHECK: callq ; FAST-LABEL: jscall_patchpoint_codegen: -; FAST: movq %r{{.+}}, (%rsp) ; FAST: movq %r{{.+}}, %rax +; FAST: movq %r{{.+}}, (%rsp) ; FAST: Ltmp ; FAST-NEXT: movabsq $-559038736, %r11 ; FAST-NEXT: callq *%r11 diff --git a/llvm/test/CodeGen/X86/patchpoint.ll b/llvm/test/CodeGen/X86/patchpoint.ll --- a/llvm/test/CodeGen/X86/patchpoint.ll +++ b/llvm/test/CodeGen/X86/patchpoint.ll @@ -9,10 +9,10 @@ ; CHECK: movabsq $-559038736, %r11 ; CHECK-NEXT: callq *%r11 ; CHECK-NEXT: xchgw %ax, %ax -; CHECK: movq %rax, %[[REG:r.+]] +; CHECK: movq %rax, %rbx ; CHECK: callq *%r11 ; CHECK-NEXT: xchgw %ax, %ax -; CHECK: movq %[[REG]], %rax +; CHECK: movq %rbx, %rax ; CHECK: ret %resolveCall2 = inttoptr i64 -559038736 to i8* %result = tail call i64 (i64, i32, i8*, i32, ...) 
@llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4) diff --git a/llvm/test/CodeGen/X86/pointer-vector.ll b/llvm/test/CodeGen/X86/pointer-vector.ll --- a/llvm/test/CodeGen/X86/pointer-vector.ll +++ b/llvm/test/CodeGen/X86/pointer-vector.ll @@ -128,11 +128,11 @@ define <4 x i32> @ICMP0(<4 x i8*>* %p0, <4 x i8*>* %p1) nounwind { ; CHECK-LABEL: ICMP0: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [9,8,7,6] ; CHECK-NEXT: movdqa (%ecx), %xmm0 ; CHECK-NEXT: pcmpgtd (%eax), %xmm0 -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [9,8,7,6] ; CHECK-NEXT: blendvps %xmm0, {{\.LCPI.*}}, %xmm1 ; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: retl @@ -147,11 +147,11 @@ define <4 x i32> @ICMP1(<4 x i8*>* %p0, <4 x i8*>* %p1) nounwind { ; CHECK-LABEL: ICMP1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [9,8,7,6] ; CHECK-NEXT: movdqa (%ecx), %xmm0 ; CHECK-NEXT: pcmpeqd (%eax), %xmm0 -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [9,8,7,6] ; CHECK-NEXT: blendvps %xmm0, {{\.LCPI.*}}, %xmm1 ; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/post-ra-sched.ll b/llvm/test/CodeGen/X86/post-ra-sched.ll --- a/llvm/test/CodeGen/X86/post-ra-sched.ll +++ b/llvm/test/CodeGen/X86/post-ra-sched.ll @@ -22,9 +22,9 @@ ; PENTIUM4-LABEL: addindirect: ; PENTIUM4: # %bb.0: # %entry ; PENTIUM4-NEXT: movl idxa, %eax -; PENTIUM4-NEXT: movl ptrs(,%eax,4), %eax ; PENTIUM4-NEXT: movl idxb, %ecx ; PENTIUM4-NEXT: movl ptrs(,%ecx,4), %ecx +; PENTIUM4-NEXT: movl ptrs(,%eax,4), %eax ; PENTIUM4-NEXT: movl (%ecx), %ecx ; PENTIUM4-NEXT: addl (%eax), %ecx ; PENTIUM4-NEXT: movl %ecx, res diff --git a/llvm/test/CodeGen/X86/pow.ll b/llvm/test/CodeGen/X86/pow.ll --- a/llvm/test/CodeGen/X86/pow.ll +++ b/llvm/test/CodeGen/X86/pow.ll @@ -14,28 +14,28 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: rsqrtss %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: mulss %xmm1, %xmm3 ; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: mulss %xmm1, %xmm3 ; CHECK-NEXT: movaps %xmm3, %xmm4 -; CHECK-NEXT: mulss %xmm2, %xmm4 ; CHECK-NEXT: mulss %xmm1, %xmm3 -; CHECK-NEXT: movss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: addss %xmm5, %xmm3 -; CHECK-NEXT: mulss %xmm4, %xmm3 ; CHECK-NEXT: movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; CHECK-NEXT: mulss %xmm2, %xmm4 +; CHECK-NEXT: addss %xmm5, %xmm3 ; CHECK-NEXT: andps %xmm1, %xmm0 -; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: cmpltss %xmm4, %xmm0 +; CHECK-NEXT: cmpltss %xmm6, %xmm0 +; CHECK-NEXT: mulss %xmm4, %xmm3 ; CHECK-NEXT: andnps %xmm3, %xmm0 ; CHECK-NEXT: xorps %xmm3, %xmm3 ; CHECK-NEXT: rsqrtss %xmm0, %xmm3 ; CHECK-NEXT: andps %xmm0, %xmm1 +; CHECK-NEXT: cmpltss %xmm6, %xmm1 ; CHECK-NEXT: mulss %xmm3, %xmm0 ; CHECK-NEXT: mulss %xmm0, %xmm2 ; CHECK-NEXT: mulss %xmm3, %xmm0 ; CHECK-NEXT: addss %xmm5, %xmm0 ; CHECK-NEXT: mulss %xmm2, %xmm0 -; CHECK-NEXT: cmpltss %xmm4, %xmm1 ; CHECK-NEXT: andnps %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -48,13 +48,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: rsqrtss %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm0, %xmm2 -; CHECK-NEXT: 
mulss %xmm1, %xmm2 ; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: mulss %xmm1, %xmm2 ; CHECK-NEXT: movaps %xmm2, %xmm4 -; CHECK-NEXT: mulss %xmm3, %xmm4 ; CHECK-NEXT: mulss %xmm1, %xmm2 -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: addss %xmm1, %xmm2 +; CHECK-NEXT: mulss %xmm3, %xmm4 +; CHECK-NEXT: addss %xmm6, %xmm2 ; CHECK-NEXT: mulss %xmm4, %xmm2 ; CHECK-NEXT: xorps %xmm4, %xmm4 ; CHECK-NEXT: cmpeqss %xmm4, %xmm0 @@ -62,12 +62,12 @@ ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: rsqrtss %xmm0, %xmm2 ; CHECK-NEXT: movaps %xmm0, %xmm5 +; CHECK-NEXT: cmpeqss %xmm4, %xmm0 ; CHECK-NEXT: mulss %xmm2, %xmm5 ; CHECK-NEXT: mulss %xmm5, %xmm3 ; CHECK-NEXT: mulss %xmm2, %xmm5 -; CHECK-NEXT: addss %xmm1, %xmm5 +; CHECK-NEXT: addss %xmm6, %xmm5 ; CHECK-NEXT: mulss %xmm3, %xmm5 -; CHECK-NEXT: cmpeqss %xmm4, %xmm0 ; CHECK-NEXT: andnps %xmm5, %xmm0 ; CHECK-NEXT: retq %r = call nsz ninf afn float @llvm.pow.f32(float %x, float 2.5e-01) @@ -89,29 +89,29 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: rsqrtps %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm0, %xmm2 -; CHECK-NEXT: mulps %xmm1, %xmm2 ; CHECK-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; CHECK-NEXT: movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; CHECK-NEXT: movaps {{.*#+}} xmm7 = [NaN,NaN,NaN,NaN] +; CHECK-NEXT: mulps %xmm1, %xmm2 +; CHECK-NEXT: andps %xmm7, %xmm0 ; CHECK-NEXT: movaps %xmm2, %xmm4 -; CHECK-NEXT: mulps %xmm3, %xmm4 ; CHECK-NEXT: mulps %xmm1, %xmm2 -; CHECK-NEXT: movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; CHECK-NEXT: addps %xmm5, %xmm2 -; CHECK-NEXT: mulps %xmm4, %xmm2 -; CHECK-NEXT: movaps {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN] -; CHECK-NEXT: andps %xmm4, %xmm0 ; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; CHECK-NEXT: mulps %xmm3, %xmm4 +; CHECK-NEXT: addps %xmm5, %xmm2 ; CHECK-NEXT: movaps %xmm1, %xmm6 ; CHECK-NEXT: cmpleps %xmm0, %xmm6 +; CHECK-NEXT: mulps %xmm4, %xmm2 ; CHECK-NEXT: andps %xmm2, %xmm6 ; CHECK-NEXT: rsqrtps %xmm6, %xmm0 ; CHECK-NEXT: movaps %xmm6, %xmm2 +; CHECK-NEXT: andps %xmm7, %xmm6 +; CHECK-NEXT: cmpleps %xmm6, %xmm1 ; CHECK-NEXT: mulps %xmm0, %xmm2 ; CHECK-NEXT: mulps %xmm2, %xmm3 ; CHECK-NEXT: mulps %xmm0, %xmm2 ; CHECK-NEXT: addps %xmm5, %xmm2 ; CHECK-NEXT: mulps %xmm3, %xmm2 -; CHECK-NEXT: andps %xmm4, %xmm6 -; CHECK-NEXT: cmpleps %xmm6, %xmm1 ; CHECK-NEXT: andps %xmm2, %xmm1 ; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -151,25 +151,25 @@ ; CHECK-LABEL: pow_v4f32_one_fourth_not_enough_fmf: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: callq powf ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq powf ; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movss 
{{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: callq powf ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; CHECK-NEXT: callq powf ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -186,13 +186,13 @@ ; CHECK-LABEL: pow_v2f64_one_fourth_not_enough_fmf: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq pow ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq pow ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] diff --git a/llvm/test/CodeGen/X86/pr11334.ll b/llvm/test/CodeGen/X86/pr11334.ll --- a/llvm/test/CodeGen/X86/pr11334.ll +++ b/llvm/test/CodeGen/X86/pr11334.ll @@ -23,11 +23,11 @@ ; SSE-NEXT: cvtps2pd %xmm0, %xmm2 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvtps2pd %xmm0, %xmm0 -; SSE-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: fldl -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: fldl -{{[0-9]+}}(%rsp) ; SSE-NEXT: retq ; ; AVX-LABEL: v3f2d_ext_vec: @@ -61,10 +61,10 @@ ; SSE-LABEL: v8f2d_ext_vec: ; SSE: # %bb.0: # %entry ; SSE-NEXT: cvtps2pd %xmm0, %xmm5 -; SSE-NEXT: cvtps2pd %xmm1, %xmm2 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvtps2pd %xmm0, %xmm4 +; SSE-NEXT: cvtps2pd %xmm1, %xmm2 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvtps2pd %xmm0, %xmm4 ; SSE-NEXT: cvtps2pd %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: movaps %xmm4, %xmm1 diff --git a/llvm/test/CodeGen/X86/pr14088.ll b/llvm/test/CodeGen/X86/pr14088.ll --- a/llvm/test/CodeGen/X86/pr14088.ll +++ b/llvm/test/CodeGen/X86/pr14088.ll @@ -20,8 +20,8 @@ ; CHECK-NEXT: movslq %r8d, %rax ; CHECK-NEXT: imulq $1374389535, %rax, %rcx # imm = 0x51EB851F ; CHECK-NEXT: movq %rcx, %rdi -; CHECK-NEXT: shrq $63, %rdi ; CHECK-NEXT: sarq $37, %rcx +; CHECK-NEXT: shrq $63, %rdi ; CHECK-NEXT: addl %edi, %ecx ; CHECK-NEXT: imull $100, %ecx, %ecx ; CHECK-NEXT: subl %ecx, %eax @@ -30,8 +30,8 @@ ; CHECK-NEXT: cltq ; CHECK-NEXT: imulq $1717986919, %rax, %rax # imm = 0x66666667 ; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrq $63, %rcx ; CHECK-NEXT: shrq $34, %rax +; CHECK-NEXT: shrq $63, %rcx ; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: movb %al, (%rdx) ; CHECK-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/pr14314.ll b/llvm/test/CodeGen/X86/pr14314.ll --- a/llvm/test/CodeGen/X86/pr14314.ll +++ b/llvm/test/CodeGen/X86/pr14314.ll @@ -8,17 +8,17 @@ ; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: movl (%ebp), %eax ; 
CHECK-NEXT: movl 4(%ebp), %edx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: movl %eax, %ebx -; CHECK-NEXT: subl %edi, %ebx ; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: subl %edi, %ebx ; CHECK-NEXT: sbbl %esi, %ecx ; CHECK-NEXT: lock cmpxchg8b (%ebp) ; CHECK-NEXT: jne .LBB0_1 diff --git a/llvm/test/CodeGen/X86/pr15267.ll b/llvm/test/CodeGen/X86/pr15267.ll --- a/llvm/test/CodeGen/X86/pr15267.ll +++ b/llvm/test/CodeGen/X86/pr15267.ll @@ -6,18 +6,18 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movzwl (%rdi), %eax ; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shrl $3, %ecx -; CHECK-NEXT: andl $7, %ecx ; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: shrl $3, %ecx ; CHECK-NEXT: andl $7, %edx ; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shrl $6, %ecx +; CHECK-NEXT: movl %eax, %edx ; CHECK-NEXT: andl $7, %ecx -; CHECK-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: shrl $9, %eax +; CHECK-NEXT: shrl $6, %edx +; CHECK-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: andl $7, %eax +; CHECK-NEXT: andl $7, %edx +; CHECK-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ; CHECK-NEXT: retq %ret = load <4 x i3>, <4 x i3>* %in, align 1 @@ -29,21 +29,21 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movb (%rdi), %al ; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shrb %cl -; CHECK-NEXT: andb $1, %cl -; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: movl %eax, %edx ; CHECK-NEXT: andb $1, %dl +; CHECK-NEXT: shrb %cl +; CHECK-NEXT: andb $1, %cl ; CHECK-NEXT: movzbl %dl, %edx +; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vmovd %edx, %xmm0 ; CHECK-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrb $3, %al ; CHECK-NEXT: shrb $2, %cl +; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: andb $1, %cl ; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; CHECK-NEXT: shrb $3, %al -; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; CHECK-NEXT: retq %ret = load <4 x i1>, <4 x i1>* %in, align 1 @@ -56,24 +56,24 @@ ; CHECK-NEXT: movb (%rdi), %al ; CHECK-NEXT: movzbl %al, %ecx ; CHECK-NEXT: shrb %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: negl %eax ; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: negl %edx +; CHECK-NEXT: negl %eax ; CHECK-NEXT: vmovd %edx, %xmm0 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: shrb $3, %cl ; CHECK-NEXT: shrb $2, %al +; CHECK-NEXT: movzbl %cl, %edx ; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: negl %edx ; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: negl %eax ; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; CHECK-NEXT: shrb $3, %cl -; CHECK-NEXT: movzbl %cl, %eax -; CHECK-NEXT: negl %eax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 @@ -89,10 +89,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movq (%rdi), %rax ; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shrl $4, %ecx -; CHECK-NEXT: andl $15, %ecx ; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: shrl $4, %ecx ; CHECK-NEXT: andl $15, %edx +; CHECK-NEXT: andl $15, %ecx ; CHECK-NEXT: vmovd %edx, %xmm0 ; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx @@ 
-143,10 +143,10 @@ ; CHECK-NEXT: andl $15, %ecx ; CHECK-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq $60, %rax ; CHECK-NEXT: shrq $56, %rcx ; CHECK-NEXT: andl $15, %ecx ; CHECK-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; CHECK-NEXT: shrq $60, %rax ; CHECK-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; CHECK-NEXT: retq %ret = load <16 x i4>, <16 x i4>* %in, align 1 diff --git a/llvm/test/CodeGen/X86/pr21792.ll b/llvm/test/CodeGen/X86/pr21792.ll --- a/llvm/test/CodeGen/X86/pr21792.ll +++ b/llvm/test/CodeGen/X86/pr21792.ll @@ -12,15 +12,15 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: movd %xmm0, %r8d -; CHECK-NEXT: leaq stuff(%r8), %rdi ; CHECK-NEXT: pextrd $1, %xmm0, %eax -; CHECK-NEXT: leaq stuff(%rax), %rsi ; CHECK-NEXT: pextrd $2, %xmm0, %edx ; CHECK-NEXT: pextrd $3, %xmm0, %ecx +; CHECK-NEXT: movd %xmm0, %r8d +; CHECK-NEXT: leaq stuff(%r8), %rdi +; CHECK-NEXT: leaq stuff+8(%r8), %r8 +; CHECK-NEXT: leaq stuff(%rax), %rsi ; CHECK-NEXT: leaq stuff(%rdx), %rdx ; CHECK-NEXT: leaq stuff(%rcx), %rcx -; CHECK-NEXT: leaq stuff+8(%r8), %r8 ; CHECK-NEXT: leaq stuff+8(%rax), %r9 ; CHECK-NEXT: callq toto ; CHECK-NEXT: popq %rax diff --git a/llvm/test/CodeGen/X86/pr31045.ll b/llvm/test/CodeGen/X86/pr31045.ll --- a/llvm/test/CodeGen/X86/pr31045.ll +++ b/llvm/test/CodeGen/X86/pr31045.ll @@ -21,6 +21,7 @@ ; CHECK-NEXT: movl struct_obj_3+{{.*}}(%rip), %eax ; CHECK-NEXT: movzbl {{.*}}(%rip), %ecx ; CHECK-NEXT: movzbl {{.*}}(%rip), %edx +; CHECK-NEXT: movb $0, {{.*}}(%rip) ; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: addl %eax, %eax ; CHECK-NEXT: subl %ecx, %eax @@ -28,7 +29,6 @@ ; CHECK-NEXT: notl %eax ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: movw %ax, struct_obj_12+{{.*}}(%rip) -; CHECK-NEXT: movb $0, {{.*}}(%rip) ; CHECK-NEXT: retq entry: %bf.load = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.d.3.7.11.15.39.71.75.91.95.99.107.123.363, %struct.d.3.7.11.15.39.71.75.91.95.99.107.123.363* @struct_obj_3, i64 0, i32 0, i32 2) to i32*), align 2 diff --git a/llvm/test/CodeGen/X86/pr32610.ll b/llvm/test/CodeGen/X86/pr32610.ll --- a/llvm/test/CodeGen/X86/pr32610.ll +++ b/llvm/test/CodeGen/X86/pr32610.ll @@ -13,8 +13,8 @@ ; CHECK-NEXT: pushl %ebp ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: movl 8(%ebp), %ecx ; CHECK-NEXT: movl L_b$non_lazy_ptr, %edx +; CHECK-NEXT: movl 8(%ebp), %ecx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl (%edx), %ecx ; CHECK-NEXT: sete %al @@ -23,15 +23,15 @@ ; CHECK-NEXT: cmpl $0, 12(%ebp) ; CHECK-NEXT: cmovel %esi, %eax ; CHECK-NEXT: cmpl (%edx), %ecx -; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: movl L_c$non_lazy_ptr, %ecx +; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: movl (%edx), %esi +; CHECK-NEXT: movl L_d$non_lazy_ptr, %edx ; CHECK-NEXT: movl %eax, (%ecx) -; CHECK-NEXT: movl (%edx), %eax -; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: movl $2, %ecx -; CHECK-NEXT: cmovnel %eax, %ecx -; CHECK-NEXT: movl L_d$non_lazy_ptr, %eax -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: cmovnel %esi, %ecx +; CHECK-NEXT: movl %ecx, (%edx) ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pr34088.ll b/llvm/test/CodeGen/X86/pr34088.ll --- a/llvm/test/CodeGen/X86/pr34088.ll +++ b/llvm/test/CodeGen/X86/pr34088.ll @@ -19,12 +19,12 @@ ; CHECK-NEXT: subl $32, %esp ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movaps %xmm0, (%esp) -; CHECK-NEXT: movsd {{.*#+}} xmm0 = 
mem[0],zero ; CHECK-NEXT: movaps {{.*#+}} xmm1 = [205,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205] -; CHECK-NEXT: movaps %xmm1, (%esp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $-842150451, {{[0-9]+}}(%esp) # imm = 0xCDCDCDCD +; CHECK-NEXT: movaps %xmm1, (%esp) ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/pr34629.ll b/llvm/test/CodeGen/X86/pr34629.ll --- a/llvm/test/CodeGen/X86/pr34629.ll +++ b/llvm/test/CodeGen/X86/pr34629.ll @@ -13,8 +13,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq {{.*}}(%rip), %rax ; CHECK-NEXT: leaq (%rax,%rax,4), %rcx -; CHECK-NEXT: negq %rcx ; CHECK-NEXT: leaq (%rax,%rax,8), %rax +; CHECK-NEXT: negq %rcx ; CHECK-NEXT: leaq (%rax,%rax,4), %rax ; CHECK-NEXT: testq %rax, %rcx ; CHECK-NEXT: je .LBB0_2 diff --git a/llvm/test/CodeGen/X86/pr40539.ll b/llvm/test/CodeGen/X86/pr40539.ll --- a/llvm/test/CodeGen/X86/pr40539.ll +++ b/llvm/test/CodeGen/X86/pr40539.ll @@ -40,6 +40,7 @@ ; CHECK-NEXT: subl $8, %esp ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: divss {{\.LCPI.*}}, %xmm0 ; CHECK-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; CHECK-NEXT: flds {{[0-9]+}}(%esp) @@ -48,7 +49,6 @@ ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: fstps (%esp) ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss %xmm0, %xmm1 ; CHECK-NEXT: setae %cl ; CHECK-NEXT: ucomiss {{\.LCPI.*}}, %xmm0 diff --git a/llvm/test/CodeGen/X86/preserve_mostcc64.ll b/llvm/test/CodeGen/X86/preserve_mostcc64.ll --- a/llvm/test/CodeGen/X86/preserve_mostcc64.ll +++ b/llvm/test/CodeGen/X86/preserve_mostcc64.ll @@ -43,7 +43,6 @@ define void @preserve_mostcc2() nounwind { entry: ;SSE-LABEL: preserve_mostcc2 -;SSE: movq %r11, [[REG:%[a-z0-9]+]] ;SSE: movaps %xmm2 ;SSE: movaps %xmm3 ;SSE: movaps %xmm4 @@ -57,6 +56,7 @@ ;SSE: movaps %xmm12 ;SSE: movaps %xmm13 ;SSE: movaps %xmm14 +;SSE: movq %r11, [[REG:%[a-z0-9]+]] ;SSE: movaps %xmm15 ;SSE: movq [[REG]], %r11 %a0 = call i64 asm sideeffect "", "={rax}"() nounwind diff --git a/llvm/test/CodeGen/X86/rdrand.ll b/llvm/test/CodeGen/X86/rdrand.ll --- a/llvm/test/CodeGen/X86/rdrand.ll +++ b/llvm/test/CodeGen/X86/rdrand.ll @@ -8,8 +8,8 @@ define i32 @_rdrand16_step(i16* %random_val) { ; X86-LABEL: _rdrand16_step: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: rdrandw %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl %ax, %edx ; X86-NEXT: movl $1, %eax ; X86-NEXT: cmovael %edx, %eax @@ -34,8 +34,8 @@ define i32 @_rdrand32_step(i32* %random_val) { ; X86-LABEL: _rdrand32_step: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: rdrandl %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $1, %eax ; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: movl %edx, (%ecx) diff --git a/llvm/test/CodeGen/X86/rdseed.ll b/llvm/test/CodeGen/X86/rdseed.ll --- a/llvm/test/CodeGen/X86/rdseed.ll +++ b/llvm/test/CodeGen/X86/rdseed.ll @@ -8,8 +8,8 @@ define i32 @_rdseed16_step(i16* %random_val) { ; X86-LABEL: _rdseed16_step: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: rdseedw %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl %ax, %edx ; X86-NEXT: movl $1, %eax ; X86-NEXT: cmovael %edx, %eax @@ -34,8 
+34,8 @@ define i32 @_rdseed32_step(i32* %random_val) { ; X86-LABEL: _rdseed32_step: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: rdseedl %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $1, %eax ; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: movl %edx, (%ecx) diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll --- a/llvm/test/CodeGen/X86/recip-fastmath.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath.ll @@ -84,8 +84,8 @@ ; SANDY-LABEL: f32_one_step: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -275,8 +275,8 @@ ; SANDY-LABEL: f32_two_step: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 ; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 ; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -433,8 +433,8 @@ ; SANDY-LABEL: v4f32_one_step: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -635,8 +635,8 @@ ; SANDY-LABEL: v4f32_two_step: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -803,8 +803,8 @@ ; SANDY-LABEL: v8f32_one_step: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -931,8 +931,8 @@ ; SANDY-LABEL: v8f32_two_step: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -1144,16 +1144,16 @@ ; SANDY-LABEL: v16f32_one_step: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; SANDY-NEXT: vrcpps %ymm1, %ymm4 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 +; SANDY-NEXT: vmulps %ymm1, %ymm4, %ymm1 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vrcpps %ymm1, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; SANDY-NEXT: vaddps 
%ymm1, %ymm2, %ymm1 +; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v16f32_one_step: @@ -1330,8 +1330,8 @@ ; SANDY-LABEL: v16f32_two_step: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll --- a/llvm/test/CodeGen/X86/recip-fastmath2.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll @@ -84,8 +84,8 @@ ; ; SANDY-LABEL: f32_one_step_2: ; SANDY: # %bb.0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3 ; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 @@ -184,8 +184,8 @@ ; SANDY-LABEL: f32_one_step_2_divs: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -305,15 +305,15 @@ ; SANDY-LABEL: f32_two_step_2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 ; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SANDY-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 ; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; SANDY-NEXT: vmulss %xmm4, %xmm1, %xmm3 ; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; SANDY-NEXT: vsubss %xmm0, %xmm4, %xmm0 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq @@ -417,8 +417,8 @@ ; ; SANDY-LABEL: v4f32_one_step2: ; SANDY: # %bb.0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3 ; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -517,8 +517,8 @@ ; SANDY-LABEL: v4f32_one_step_2_divs: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -649,15 +649,15 @@ ; SANDY-LABEL: v4f32_two_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] -; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; SANDY-NEXT: vmulps %xmm4, 
%xmm1, %xmm3 ; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0 -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; SANDY-NEXT: vsubps %xmm0, %xmm4, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq @@ -879,8 +879,8 @@ ; SANDY-LABEL: v8f32_one_step_2_divs: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -1026,15 +1026,15 @@ ; SANDY-LABEL: v8f32_two_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 +; SANDY-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: retq @@ -1224,16 +1224,16 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; SANDY-NEXT: vmulps %ymm4, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vrcpps %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm4, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] -; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm4 +; SANDY-NEXT: vmulps %ymm5, %ymm3, %ymm4 ; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 -; SANDY-NEXT: vsubps %ymm1, %ymm2, %ymm1 +; SANDY-NEXT: vsubps %ymm1, %ymm5, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm3, %ymm1 ; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; SANDY-NEXT: retq @@ -1394,18 +1394,18 @@ ; SANDY-LABEL: v16f32_one_step_2_divs: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vrcpps %ymm1, %ymm4 -; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm1, %ymm4, %ymm1 +; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 ; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 ; SANDY-NEXT: vmulps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 ; 
SANDY-NEXT: retq @@ -1618,26 +1618,26 @@ ; SANDY-LABEL: v16f32_two_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovaps {{.*#+}} ymm7 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vrcpps %ymm1, %ymm6 +; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm5 +; SANDY-NEXT: vmulps %ymm7, %ymm2, %ymm5 ; SANDY-NEXT: vmulps %ymm5, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; SANDY-NEXT: vrcpps %ymm1, %ymm3 +; SANDY-NEXT: vsubps %ymm0, %ymm7, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm5, %ymm0 -; SANDY-NEXT: vmulps %ymm3, %ymm1, %ymm2 +; SANDY-NEXT: vmulps %ymm6, %ymm1, %ymm2 ; SANDY-NEXT: vsubps %ymm2, %ymm4, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm2 -; SANDY-NEXT: vaddps %ymm2, %ymm3, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] -; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; SANDY-NEXT: vaddps %ymm0, %ymm5, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SANDY-NEXT: vmulps %ymm2, %ymm6, %ymm2 +; SANDY-NEXT: vaddps %ymm2, %ymm6, %ymm2 +; SANDY-NEXT: vmulps %ymm5, %ymm2, %ymm4 ; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 -; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; SANDY-NEXT: vsubps %ymm1, %ymm5, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 ; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; SANDY-NEXT: retq diff --git a/llvm/test/CodeGen/X86/rot32.ll b/llvm/test/CodeGen/X86/rot32.ll --- a/llvm/test/CodeGen/X86/rot32.ll +++ b/llvm/test/CodeGen/X86/rot32.ll @@ -14,13 +14,29 @@ ; CHECK32-NEXT: roll %cl, %eax ; CHECK32-NEXT: retl ; -; CHECK64-LABEL: foo: -; CHECK64: # %bb.0: # %entry -; CHECK64-NEXT: movl %edx, %ecx -; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK64-NEXT: roll %cl, %eax -; CHECK64-NEXT: retq +; X64-LABEL: foo: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: roll %cl, %eax +; X64-NEXT: retq +; +; SHLD64-LABEL: foo: +; SHLD64: # %bb.0: # %entry +; SHLD64-NEXT: movl %edi, %eax +; SHLD64-NEXT: movl %edx, %ecx +; SHLD64-NEXT: # kill: def $cl killed $cl killed $ecx +; SHLD64-NEXT: roll %cl, %eax +; SHLD64-NEXT: retq +; +; BMI264-LABEL: foo: +; BMI264: # %bb.0: # %entry +; BMI264-NEXT: movl %edx, %ecx +; BMI264-NEXT: movl %edi, %eax +; BMI264-NEXT: # kill: def $cl killed $cl killed $ecx +; BMI264-NEXT: roll %cl, %eax +; BMI264-NEXT: retq entry: %0 = shl i32 %x, %z %1 = sub i32 32, %z @@ -38,13 +54,29 @@ ; CHECK32-NEXT: shldl %cl, %edx, %eax ; CHECK32-NEXT: retl ; -; CHECK64-LABEL: bar: -; CHECK64: # %bb.0: # %entry -; CHECK64-NEXT: movl %edx, %ecx -; CHECK64-NEXT: movl %esi, %eax -; CHECK64-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK64-NEXT: shldl %cl, %edi, %eax -; CHECK64-NEXT: retq +; X64-LABEL: bar: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shldl %cl, %edi, %eax +; X64-NEXT: retq +; +; SHLD64-LABEL: bar: +; SHLD64: # %bb.0: # %entry 
+; SHLD64-NEXT: movl %esi, %eax +; SHLD64-NEXT: movl %edx, %ecx +; SHLD64-NEXT: # kill: def $cl killed $cl killed $ecx +; SHLD64-NEXT: shldl %cl, %edi, %eax +; SHLD64-NEXT: retq +; +; BMI264-LABEL: bar: +; BMI264: # %bb.0: # %entry +; BMI264-NEXT: movl %edx, %ecx +; BMI264-NEXT: movl %esi, %eax +; BMI264-NEXT: # kill: def $cl killed $cl killed $ecx +; BMI264-NEXT: shldl %cl, %edi, %eax +; BMI264-NEXT: retq entry: %0 = shl i32 %y, %z %1 = sub i32 32, %z @@ -61,13 +93,29 @@ ; CHECK32-NEXT: rorl %cl, %eax ; CHECK32-NEXT: retl ; -; CHECK64-LABEL: un: -; CHECK64: # %bb.0: # %entry -; CHECK64-NEXT: movl %edx, %ecx -; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK64-NEXT: rorl %cl, %eax -; CHECK64-NEXT: retq +; X64-LABEL: un: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: rorl %cl, %eax +; X64-NEXT: retq +; +; SHLD64-LABEL: un: +; SHLD64: # %bb.0: # %entry +; SHLD64-NEXT: movl %edi, %eax +; SHLD64-NEXT: movl %edx, %ecx +; SHLD64-NEXT: # kill: def $cl killed $cl killed $ecx +; SHLD64-NEXT: rorl %cl, %eax +; SHLD64-NEXT: retq +; +; BMI264-LABEL: un: +; BMI264: # %bb.0: # %entry +; BMI264-NEXT: movl %edx, %ecx +; BMI264-NEXT: movl %edi, %eax +; BMI264-NEXT: # kill: def $cl killed $cl killed $ecx +; BMI264-NEXT: rorl %cl, %eax +; BMI264-NEXT: retq entry: %0 = lshr i32 %x, %z %1 = sub i32 32, %z @@ -85,13 +133,29 @@ ; CHECK32-NEXT: shrdl %cl, %edx, %eax ; CHECK32-NEXT: retl ; -; CHECK64-LABEL: bu: -; CHECK64: # %bb.0: # %entry -; CHECK64-NEXT: movl %edx, %ecx -; CHECK64-NEXT: movl %esi, %eax -; CHECK64-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK64-NEXT: shrdl %cl, %edi, %eax -; CHECK64-NEXT: retq +; X64-LABEL: bu: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shrdl %cl, %edi, %eax +; X64-NEXT: retq +; +; SHLD64-LABEL: bu: +; SHLD64: # %bb.0: # %entry +; SHLD64-NEXT: movl %esi, %eax +; SHLD64-NEXT: movl %edx, %ecx +; SHLD64-NEXT: # kill: def $cl killed $cl killed $ecx +; SHLD64-NEXT: shrdl %cl, %edi, %eax +; SHLD64-NEXT: retq +; +; BMI264-LABEL: bu: +; BMI264: # %bb.0: # %entry +; BMI264-NEXT: movl %edx, %ecx +; BMI264-NEXT: movl %esi, %eax +; BMI264-NEXT: # kill: def $cl killed $cl killed $ecx +; BMI264-NEXT: shrdl %cl, %edi, %eax +; BMI264-NEXT: retq entry: %0 = lshr i32 %y, %z %1 = sub i32 32, %z diff --git a/llvm/test/CodeGen/X86/rot64.ll b/llvm/test/CodeGen/X86/rot64.ll --- a/llvm/test/CodeGen/X86/rot64.ll +++ b/llvm/test/CodeGen/X86/rot64.ll @@ -4,13 +4,29 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=core-avx2 | FileCheck %s --check-prefix=ALL --check-prefix=BMI2 define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone { -; ALL-LABEL: foo: -; ALL: # %bb.0: # %entry -; ALL-NEXT: movq %rdx, %rcx -; ALL-NEXT: movq %rdi, %rax -; ALL-NEXT: # kill: def $cl killed $cl killed $rcx -; ALL-NEXT: rolq %cl, %rax -; ALL-NEXT: retq +; X64-LABEL: foo: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-NEXT: rolq %cl, %rax +; X64-NEXT: retq +; +; SHLD-LABEL: foo: +; SHLD: # %bb.0: # %entry +; SHLD-NEXT: movq %rdi, %rax +; SHLD-NEXT: movq %rdx, %rcx +; SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; SHLD-NEXT: rolq %cl, %rax +; SHLD-NEXT: retq +; +; BMI2-LABEL: foo: +; BMI2: # %bb.0: # %entry +; BMI2-NEXT: movq %rdx, %rcx +; BMI2-NEXT: movq %rdi, %rax +; 
BMI2-NEXT: # kill: def $cl killed $cl killed $rcx +; BMI2-NEXT: rolq %cl, %rax +; BMI2-NEXT: retq entry: %0 = shl i64 %x, %z %1 = sub i64 64, %z @@ -20,13 +36,29 @@ } define i64 @bar(i64 %x, i64 %y, i64 %z) nounwind readnone { -; ALL-LABEL: bar: -; ALL: # %bb.0: # %entry -; ALL-NEXT: movq %rdx, %rcx -; ALL-NEXT: movq %rsi, %rax -; ALL-NEXT: # kill: def $cl killed $cl killed $rcx -; ALL-NEXT: shldq %cl, %rdi, %rax -; ALL-NEXT: retq +; X64-LABEL: bar: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-NEXT: shldq %cl, %rdi, %rax +; X64-NEXT: retq +; +; SHLD-LABEL: bar: +; SHLD: # %bb.0: # %entry +; SHLD-NEXT: movq %rsi, %rax +; SHLD-NEXT: movq %rdx, %rcx +; SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; SHLD-NEXT: shldq %cl, %rdi, %rax +; SHLD-NEXT: retq +; +; BMI2-LABEL: bar: +; BMI2: # %bb.0: # %entry +; BMI2-NEXT: movq %rdx, %rcx +; BMI2-NEXT: movq %rsi, %rax +; BMI2-NEXT: # kill: def $cl killed $cl killed $rcx +; BMI2-NEXT: shldq %cl, %rdi, %rax +; BMI2-NEXT: retq entry: %0 = shl i64 %y, %z %1 = sub i64 64, %z @@ -36,13 +68,29 @@ } define i64 @un(i64 %x, i64 %y, i64 %z) nounwind readnone { -; ALL-LABEL: un: -; ALL: # %bb.0: # %entry -; ALL-NEXT: movq %rdx, %rcx -; ALL-NEXT: movq %rdi, %rax -; ALL-NEXT: # kill: def $cl killed $cl killed $rcx -; ALL-NEXT: rorq %cl, %rax -; ALL-NEXT: retq +; X64-LABEL: un: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-NEXT: rorq %cl, %rax +; X64-NEXT: retq +; +; SHLD-LABEL: un: +; SHLD: # %bb.0: # %entry +; SHLD-NEXT: movq %rdi, %rax +; SHLD-NEXT: movq %rdx, %rcx +; SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; SHLD-NEXT: rorq %cl, %rax +; SHLD-NEXT: retq +; +; BMI2-LABEL: un: +; BMI2: # %bb.0: # %entry +; BMI2-NEXT: movq %rdx, %rcx +; BMI2-NEXT: movq %rdi, %rax +; BMI2-NEXT: # kill: def $cl killed $cl killed $rcx +; BMI2-NEXT: rorq %cl, %rax +; BMI2-NEXT: retq entry: %0 = lshr i64 %x, %z %1 = sub i64 64, %z @@ -52,13 +100,29 @@ } define i64 @bu(i64 %x, i64 %y, i64 %z) nounwind readnone { -; ALL-LABEL: bu: -; ALL: # %bb.0: # %entry -; ALL-NEXT: movq %rdx, %rcx -; ALL-NEXT: movq %rsi, %rax -; ALL-NEXT: # kill: def $cl killed $cl killed $rcx -; ALL-NEXT: shrdq %cl, %rdi, %rax -; ALL-NEXT: retq +; X64-LABEL: bu: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-NEXT: shrdq %cl, %rdi, %rax +; X64-NEXT: retq +; +; SHLD-LABEL: bu: +; SHLD: # %bb.0: # %entry +; SHLD-NEXT: movq %rsi, %rax +; SHLD-NEXT: movq %rdx, %rcx +; SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; SHLD-NEXT: shrdq %cl, %rdi, %rax +; SHLD-NEXT: retq +; +; BMI2-LABEL: bu: +; BMI2: # %bb.0: # %entry +; BMI2-NEXT: movq %rdx, %rcx +; BMI2-NEXT: movq %rsi, %rax +; BMI2-NEXT: # kill: def $cl killed $cl killed $rcx +; BMI2-NEXT: shrdq %cl, %rdi, %rax +; BMI2-NEXT: retq entry: %0 = lshr i64 %y, %z %1 = sub i64 64, %z diff --git a/llvm/test/CodeGen/X86/sandybridge-loads.ll b/llvm/test/CodeGen/X86/sandybridge-loads.ll --- a/llvm/test/CodeGen/X86/sandybridge-loads.ll +++ b/llvm/test/CodeGen/X86/sandybridge-loads.ll @@ -5,10 +5,10 @@ ; CHECK-LABEL: wideloads: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm0 -; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 ; CHECK-NEXT: vmovaps (%rsi), %ymm1 -; CHECK-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 ; CHECK-NEXT: vmovaps (%rdx), %ymm2 +; CHECK-NEXT: vinsertf128 $1, 
16(%rdi), %ymm0, %ymm0 +; CHECK-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 ; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/sibcall.ll b/llvm/test/CodeGen/X86/sibcall.ll --- a/llvm/test/CodeGen/X86/sibcall.ll +++ b/llvm/test/CodeGen/X86/sibcall.ll @@ -362,11 +362,11 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl 16(%eax), %ecx ; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: movsd %xmm1, {{[0-9]+}}(%esp) ; X86-NEXT: movsd %xmm0, (%esp) -; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: calll foo7 ; X86-NEXT: addl $28, %esp ; X86-NEXT: retl @@ -706,8 +706,8 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl %ecx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll f_sret ; X86-NEXT: movl %esi, %eax @@ -741,8 +741,8 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: calll t21_f_sret ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $8, %esp @@ -752,8 +752,8 @@ ; X64-LABEL: t21_sret_to_sret_second_arg_sret: ; X64: # %bb.0: ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rsi, %rbx ; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: movq %rsi, %rbx ; X64-NEXT: callq t21_f_sret ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: popq %rbx @@ -762,8 +762,8 @@ ; X32-LABEL: t21_sret_to_sret_second_arg_sret: ; X32: # %bb.0: ; X32-NEXT: pushq %rbx -; X32-NEXT: movl %esi, %ebx ; X32-NEXT: movl %esi, %edi +; X32-NEXT: movl %esi, %ebx ; X32-NEXT: callq t21_f_sret ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: popq %rbx @@ -777,9 +777,9 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl %ecx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edx, (%esp) +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: movl %eax, %edx ; X86-NEXT: calll f_sret ; X86-NEXT: movl %esi, %eax @@ -791,8 +791,8 @@ ; X64: # %bb.0: ; X64-NEXT: pushq %rbx ; X64-NEXT: movl %esi, %eax -; X64-NEXT: movq %rdi, %rbx ; X64-NEXT: movl %edx, %esi +; X64-NEXT: movq %rdi, %rbx ; X64-NEXT: movl %eax, %edx ; X64-NEXT: callq f_sret ; X64-NEXT: movq %rbx, %rax @@ -803,8 +803,8 @@ ; X32: # %bb.0: ; X32-NEXT: pushq %rbx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %edi, %ebx ; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edi, %ebx ; X32-NEXT: movl %eax, %edx ; X32-NEXT: callq f_sret ; X32-NEXT: movl %ebx, %eax diff --git a/llvm/test/CodeGen/X86/sjlj.ll b/llvm/test/CodeGen/X86/sjlj.ll --- a/llvm/test/CodeGen/X86/sjlj.ll +++ b/llvm/test/CodeGen/X86/sjlj.ll @@ -26,9 +26,9 @@ ; x86: movl ${{.*LBB.*}}, buf+4 ; X86: ret ; PIC86: sj0 -; PIC86: movl %ebp, buf@GOTOFF(%[[GOT:.*]]) +; PIC86: leal {{.*LBB.*}}@GOTOFF(%[[GOT:.*]]), %[[LREG:.*]] +; PIC86: movl %ebp, buf@GOTOFF(%[[GOT]]) ; PIC86: movl %esp, buf@GOTOFF+8(%[[GOT]]) -; PIC86: leal {{.*LBB.*}}@GOTOFF(%[[GOT]]), %[[LREG:.*]] ; PIC86: movl %[[LREG]], buf@GOTOFF+4 ; PIC86: ret ; X64: sj0 @@ -37,9 +37,9 @@ ; X64: movq %rsp, buf+16(%rip) ; X64: ret ; PIC64: sj0 +; PIC64: leaq {{.*LBB.*}}(%rip), %[[LREG:.*]] ; PIC64: movq %rbp, buf(%rip) ; PIC64: movq %rsp, buf+16(%rip) -; PIC64: leaq {{.*LBB.*}}(%rip), %[[LREG:.*]] ; PIC64: movq %[[LREG]], buf+8(%rip) ; PIC64: ret } diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll 
b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll --- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -9,14 +9,14 @@ ; NHM: # %bb.0: ; NHM-NEXT: rsqrtss %xmm0, %xmm1 ; NHM-NEXT: movaps %xmm0, %xmm2 -; NHM-NEXT: mulss %xmm1, %xmm2 ; NHM-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; NHM-NEXT: andps {{.*}}(%rip), %xmm0 +; NHM-NEXT: cmpltss {{.*}}(%rip), %xmm0 +; NHM-NEXT: mulss %xmm1, %xmm2 ; NHM-NEXT: mulss %xmm2, %xmm3 ; NHM-NEXT: mulss %xmm1, %xmm2 ; NHM-NEXT: addss {{.*}}(%rip), %xmm2 -; NHM-NEXT: andps {{.*}}(%rip), %xmm0 ; NHM-NEXT: mulss %xmm3, %xmm2 -; NHM-NEXT: cmpltss {{.*}}(%rip), %xmm0 ; NHM-NEXT: andnps %xmm2, %xmm0 ; NHM-NEXT: retq ; @@ -33,15 +33,15 @@ ; NHM: # %bb.0: ; NHM-NEXT: rsqrtps %xmm0, %xmm2 ; NHM-NEXT: movaps %xmm0, %xmm1 -; NHM-NEXT: mulps %xmm2, %xmm1 ; NHM-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; NHM-NEXT: andps {{.*}}(%rip), %xmm0 +; NHM-NEXT: mulps %xmm2, %xmm1 ; NHM-NEXT: mulps %xmm1, %xmm3 ; NHM-NEXT: mulps %xmm2, %xmm1 ; NHM-NEXT: addps {{.*}}(%rip), %xmm1 -; NHM-NEXT: andps {{.*}}(%rip), %xmm0 -; NHM-NEXT: mulps %xmm3, %xmm1 ; NHM-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; NHM-NEXT: cmpleps %xmm0, %xmm2 +; NHM-NEXT: mulps %xmm3, %xmm1 ; NHM-NEXT: andps %xmm2, %xmm1 ; NHM-NEXT: movaps %xmm1, %xmm0 ; NHM-NEXT: retq @@ -51,12 +51,12 @@ ; SNB-NEXT: vrsqrtps %xmm0, %xmm1 ; SNB-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; SNB-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm3 +; SNB-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; SNB-NEXT: vmulps %xmm1, %xmm2, %xmm1 ; SNB-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1 -; SNB-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; SNB-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; SNB-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 +; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; SNB-NEXT: vandps %xmm1, %xmm0, %xmm0 ; SNB-NEXT: retq ; @@ -87,31 +87,31 @@ define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 { ; NHM-LABEL: v8f32_no_daz: ; NHM: # %bb.0: -; NHM-NEXT: movaps %xmm0, %xmm2 +; NHM-NEXT: movaps {{.*#+}} xmm6 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; NHM-NEXT: movaps {{.*#+}} xmm9 = [NaN,NaN,NaN,NaN] ; NHM-NEXT: rsqrtps %xmm0, %xmm3 -; NHM-NEXT: mulps %xmm3, %xmm0 +; NHM-NEXT: rsqrtps %xmm1, %xmm8 +; NHM-NEXT: movaps %xmm0, %xmm2 ; NHM-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; NHM-NEXT: movaps %xmm0, %xmm5 -; NHM-NEXT: mulps %xmm4, %xmm5 +; NHM-NEXT: movaps {{.*#+}} xmm10 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; NHM-NEXT: mulps %xmm3, %xmm0 -; NHM-NEXT: movaps {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; NHM-NEXT: addps %xmm3, %xmm0 -; NHM-NEXT: mulps %xmm5, %xmm0 -; NHM-NEXT: movaps {{.*#+}} xmm5 = [NaN,NaN,NaN,NaN] -; NHM-NEXT: andps %xmm5, %xmm2 -; NHM-NEXT: movaps {{.*#+}} xmm6 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; NHM-NEXT: andps %xmm9, %xmm2 ; NHM-NEXT: movaps %xmm6, %xmm7 ; NHM-NEXT: cmpleps %xmm2, %xmm7 -; NHM-NEXT: andps %xmm7, %xmm0 -; NHM-NEXT: rsqrtps %xmm1, %xmm7 ; NHM-NEXT: movaps %xmm1, %xmm2 -; NHM-NEXT: mulps %xmm7, %xmm2 +; NHM-NEXT: andps %xmm9, %xmm1 +; NHM-NEXT: mulps %xmm8, %xmm2 +; NHM-NEXT: cmpleps %xmm1, %xmm6 +; NHM-NEXT: movaps %xmm0, %xmm5 +; NHM-NEXT: mulps %xmm3, %xmm0 +; NHM-NEXT: mulps %xmm4, %xmm5 ; NHM-NEXT: mulps %xmm2, %xmm4 -; NHM-NEXT: mulps %xmm7, %xmm2 -; NHM-NEXT: addps %xmm3, %xmm2 +; NHM-NEXT: mulps %xmm8, %xmm2 +; NHM-NEXT: addps %xmm10, 
%xmm0 +; NHM-NEXT: addps %xmm10, %xmm2 +; NHM-NEXT: mulps %xmm5, %xmm0 ; NHM-NEXT: mulps %xmm4, %xmm2 -; NHM-NEXT: andps %xmm5, %xmm1 -; NHM-NEXT: cmpleps %xmm1, %xmm6 +; NHM-NEXT: andps %xmm7, %xmm0 ; NHM-NEXT: andps %xmm6, %xmm2 ; NHM-NEXT: movaps %xmm2, %xmm1 ; NHM-NEXT: retq @@ -121,12 +121,12 @@ ; SNB-NEXT: vrsqrtps %ymm0, %ymm1 ; SNB-NEXT: vmulps %ymm1, %ymm0, %ymm2 ; SNB-NEXT: vmulps {{.*}}(%rip), %ymm2, %ymm3 +; SNB-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; SNB-NEXT: vmulps %ymm1, %ymm2, %ymm1 ; SNB-NEXT: vaddps {{.*}}(%rip), %ymm1, %ymm1 -; SNB-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 ; SNB-NEXT: vmovaps {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; SNB-NEXT: vcmpleps %ymm0, %ymm2, %ymm0 +; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 ; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0 ; SNB-NEXT: retq ; @@ -161,14 +161,14 @@ ; NHM: # %bb.0: ; NHM-NEXT: rsqrtss %xmm0, %xmm1 ; NHM-NEXT: movaps %xmm0, %xmm2 -; NHM-NEXT: mulss %xmm1, %xmm2 ; NHM-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; NHM-NEXT: mulss %xmm1, %xmm2 ; NHM-NEXT: mulss %xmm2, %xmm3 ; NHM-NEXT: mulss %xmm1, %xmm2 ; NHM-NEXT: addss {{.*}}(%rip), %xmm2 -; NHM-NEXT: mulss %xmm3, %xmm2 ; NHM-NEXT: xorps %xmm1, %xmm1 ; NHM-NEXT: cmpeqss %xmm1, %xmm0 +; NHM-NEXT: mulss %xmm3, %xmm2 ; NHM-NEXT: andnps %xmm2, %xmm0 ; NHM-NEXT: retq ; @@ -185,14 +185,14 @@ ; NHM: # %bb.0: ; NHM-NEXT: rsqrtps %xmm0, %xmm1 ; NHM-NEXT: movaps %xmm0, %xmm2 -; NHM-NEXT: mulps %xmm1, %xmm2 ; NHM-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; NHM-NEXT: mulps %xmm1, %xmm2 ; NHM-NEXT: mulps %xmm2, %xmm3 ; NHM-NEXT: mulps %xmm1, %xmm2 ; NHM-NEXT: addps {{.*}}(%rip), %xmm2 -; NHM-NEXT: mulps %xmm3, %xmm2 ; NHM-NEXT: xorps %xmm1, %xmm1 ; NHM-NEXT: cmpneqps %xmm1, %xmm0 +; NHM-NEXT: mulps %xmm3, %xmm2 ; NHM-NEXT: andps %xmm2, %xmm0 ; NHM-NEXT: retq ; @@ -203,9 +203,9 @@ ; SNB-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm3 ; SNB-NEXT: vmulps %xmm1, %xmm2, %xmm1 ; SNB-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1 -; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; SNB-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; SNB-NEXT: vcmpneqps %xmm2, %xmm0, %xmm0 +; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; SNB-NEXT: vandps %xmm1, %xmm0, %xmm0 ; SNB-NEXT: retq ; @@ -235,26 +235,26 @@ ; NHM-LABEL: v8f32_daz: ; NHM: # %bb.0: ; NHM-NEXT: rsqrtps %xmm0, %xmm2 +; NHM-NEXT: rsqrtps %xmm1, %xmm7 ; NHM-NEXT: movaps %xmm0, %xmm3 -; NHM-NEXT: mulps %xmm2, %xmm3 +; NHM-NEXT: movaps %xmm1, %xmm6 ; NHM-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; NHM-NEXT: movaps {{.*#+}} xmm8 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; NHM-NEXT: mulps %xmm2, %xmm3 +; NHM-NEXT: mulps %xmm7, %xmm6 ; NHM-NEXT: movaps %xmm3, %xmm5 -; NHM-NEXT: mulps %xmm4, %xmm5 ; NHM-NEXT: mulps %xmm2, %xmm3 -; NHM-NEXT: movaps {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; NHM-NEXT: addps %xmm2, %xmm3 +; NHM-NEXT: mulps %xmm4, %xmm5 +; NHM-NEXT: mulps %xmm6, %xmm4 +; NHM-NEXT: mulps %xmm7, %xmm6 +; NHM-NEXT: addps %xmm8, %xmm3 +; NHM-NEXT: addps %xmm8, %xmm6 ; NHM-NEXT: mulps %xmm5, %xmm3 ; NHM-NEXT: xorps %xmm5, %xmm5 ; NHM-NEXT: cmpneqps %xmm5, %xmm0 -; NHM-NEXT: andps %xmm3, %xmm0 -; NHM-NEXT: rsqrtps %xmm1, %xmm3 -; NHM-NEXT: movaps %xmm1, %xmm6 -; NHM-NEXT: mulps %xmm3, %xmm6 -; NHM-NEXT: mulps %xmm6, %xmm4 -; NHM-NEXT: mulps %xmm3, %xmm6 -; NHM-NEXT: addps %xmm2, %xmm6 -; NHM-NEXT: mulps %xmm4, %xmm6 ; NHM-NEXT: cmpneqps %xmm5, %xmm1 +; NHM-NEXT: mulps %xmm4, %xmm6 +; NHM-NEXT: andps 
%xmm3, %xmm0 ; NHM-NEXT: andps %xmm6, %xmm1 ; NHM-NEXT: retq ; @@ -265,9 +265,9 @@ ; SNB-NEXT: vmulps {{.*}}(%rip), %ymm2, %ymm3 ; SNB-NEXT: vmulps %ymm1, %ymm2, %ymm1 ; SNB-NEXT: vaddps {{.*}}(%rip), %ymm1, %ymm1 -; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 ; SNB-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; SNB-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 ; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0 ; SNB-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -79,14 +79,14 @@ ; SSE: # %bb.0: ; SSE-NEXT: rsqrtss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm1, %xmm2 ; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: cmpltss {{.*}}(%rip), %xmm0 +; SSE-NEXT: mulss %xmm1, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm3 ; SSE-NEXT: mulss %xmm1, %xmm2 ; SSE-NEXT: addss {{.*}}(%rip), %xmm2 -; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: mulss %xmm3, %xmm2 -; SSE-NEXT: cmpltss {{.*}}(%rip), %xmm0 ; SSE-NEXT: andnps %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -94,26 +94,26 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: finite_f32_estimate_ieee_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %k1 +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem ; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2 ; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %k1 -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq @@ -140,14 +140,14 @@ ; SSE: # %bb.0: ; SSE-NEXT: rsqrtss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm1, %xmm2 ; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm1, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm3 ; SSE-NEXT: mulss %xmm1, %xmm2 ; SSE-NEXT: addss {{.*}}(%rip), %xmm2 -; SSE-NEXT: mulss %xmm3, %xmm2 ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cmpeqss %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm3, %xmm2 ; SSE-NEXT: andnps %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -156,8 +156,8 @@ ; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vcmpeqss %xmm2, %xmm0, %xmm0 @@ -223,14 +223,14 @@ ; SSE: # %bb.0: ; SSE-NEXT: rsqrtss 
%xmm0, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm1, %xmm2 ; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: cmpltss {{.*}}(%rip), %xmm0 +; SSE-NEXT: mulss %xmm1, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm3 ; SSE-NEXT: mulss %xmm1, %xmm2 ; SSE-NEXT: addss {{.*}}(%rip), %xmm2 -; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: mulss %xmm3, %xmm2 -; SSE-NEXT: cmpltss {{.*}}(%rip), %xmm0 ; SSE-NEXT: andnps %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -238,26 +238,26 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: sqrtf_check_denorms_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %k1 +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem ; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2 ; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vcmpltss {{.*}}(%rip), %xmm0, %k1 -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq @@ -284,15 +284,15 @@ ; SSE: # %bb.0: ; SSE-NEXT: rsqrtps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm2, %xmm1 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: mulps %xmm2, %xmm1 ; SSE-NEXT: mulps %xmm1, %xmm3 ; SSE-NEXT: mulps %xmm2, %xmm1 ; SSE-NEXT: addps {{.*}}(%rip), %xmm1 -; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps %xmm3, %xmm1 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; SSE-NEXT: cmpleps %xmm0, %xmm2 +; SSE-NEXT: mulps %xmm3, %xmm1 ; SSE-NEXT: andps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq @@ -302,28 +302,28 @@ ; AVX1-NEXT: vrsqrtps %xmm0, %xmm1 ; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm3 +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: sqrt_v4f32_check_denorms_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vrsqrtps %xmm0, %xmm1 -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN] +; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: 
vandps %xmm4, %xmm0, %xmm0 ; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1 ; AVX512-NEXT: vmulps %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] -; AVX512-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; AVX512-NEXT: vcmpleps %xmm0, %xmm3, %xmm0 ; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %call = tail call ninf <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2 @@ -355,8 +355,8 @@ ; SSE-NEXT: rsqrtss %xmm0, %xmm1 ; SSE-NEXT: mulss %xmm1, %xmm0 ; SSE-NEXT: mulss %xmm1, %xmm0 -; SSE-NEXT: addss {{.*}}(%rip), %xmm0 ; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 +; SSE-NEXT: addss {{.*}}(%rip), %xmm0 ; SSE-NEXT: mulss %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -365,8 +365,8 @@ ; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -415,8 +415,8 @@ ; SSE-NEXT: rsqrtps %xmm0, %xmm1 ; SSE-NEXT: mulps %xmm1, %xmm0 ; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: addps {{.*}}(%rip), %xmm0 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: addps {{.*}}(%rip), %xmm0 ; SSE-NEXT: mulps %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -425,16 +425,16 @@ ; AVX1-NEXT: vrsqrtps %xmm0, %xmm1 ; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: v4f32_estimate: ; AVX512: # %bb.0: ; AVX512-NEXT: vrsqrtps %xmm0, %xmm1 -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 @@ -478,18 +478,18 @@ ; SSE-LABEL: v8f32_estimate: ; SSE: # %bb.0: ; SSE-NEXT: rsqrtps %xmm0, %xmm2 +; SSE-NEXT: rsqrtps %xmm1, %xmm5 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; SSE-NEXT: movaps {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: mulps %xmm5, %xmm1 ; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: mulps %xmm5, %xmm1 ; SSE-NEXT: mulps %xmm3, %xmm2 -; SSE-NEXT: movaps {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; SSE-NEXT: mulps %xmm5, %xmm3 ; SSE-NEXT: addps %xmm4, %xmm0 -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: rsqrtps %xmm1, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: mulps %xmm2, %xmm1 ; SSE-NEXT: addps %xmm4, %xmm1 +; SSE-NEXT: mulps %xmm2, %xmm0 ; SSE-NEXT: mulps %xmm3, %xmm1 ; SSE-NEXT: retq ; @@ -498,16 +498,16 @@ ; AVX1-NEXT: vrsqrtps %ymm0, %ymm1 ; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; 
AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: v8f32_estimate: ; AVX512: # %bb.0: ; AVX512-NEXT: vrsqrtps %ymm0, %ymm1 -; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2 ; AVX512-NEXT: vbroadcastss {{.*#+}} ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; AVX512-NEXT: vmulps %ymm0, %ymm1, %ymm0 @@ -521,18 +521,18 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 { ; SSE-LABEL: v16f32_no_estimate: ; SSE: # %bb.0: -; SSE-NEXT: sqrtps %xmm3, %xmm4 ; SSE-NEXT: sqrtps %xmm2, %xmm5 ; SSE-NEXT: sqrtps %xmm1, %xmm2 ; SSE-NEXT: sqrtps %xmm0, %xmm1 +; SSE-NEXT: sqrtps %xmm3, %xmm4 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: divps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: divps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: divps %xmm5, %xmm2 ; SSE-NEXT: divps %xmm4, %xmm3 +; SSE-NEXT: divps %xmm5, %xmm2 ; SSE-NEXT: retq ; ; AVX1-LABEL: v16f32_no_estimate: @@ -558,50 +558,50 @@ define <16 x float> @v16f32_estimate(<16 x float> %x) #1 { ; SSE-LABEL: v16f32_estimate: ; SSE: # %bb.0: -; SSE-NEXT: rsqrtps %xmm0, %xmm5 +; SSE-NEXT: rsqrtps %xmm1, %xmm7 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; SSE-NEXT: movaps {{.*#+}} xmm9 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; SSE-NEXT: rsqrtps %xmm0, %xmm5 +; SSE-NEXT: rsqrtps %xmm2, %xmm8 +; SSE-NEXT: mulps %xmm7, %xmm1 ; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 +; SSE-NEXT: mulps %xmm8, %xmm2 ; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: mulps %xmm4, %xmm6 -; SSE-NEXT: movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; SSE-NEXT: addps %xmm5, %xmm0 +; SSE-NEXT: mulps %xmm7, %xmm1 +; SSE-NEXT: mulps %xmm4, %xmm7 +; SSE-NEXT: mulps %xmm8, %xmm2 +; SSE-NEXT: mulps %xmm5, %xmm0 +; SSE-NEXT: mulps %xmm4, %xmm8 +; SSE-NEXT: addps %xmm9, %xmm1 +; SSE-NEXT: addps %xmm9, %xmm0 +; SSE-NEXT: addps %xmm9, %xmm2 +; SSE-NEXT: mulps %xmm7, %xmm1 +; SSE-NEXT: rsqrtps %xmm3, %xmm7 ; SSE-NEXT: mulps %xmm6, %xmm0 -; SSE-NEXT: rsqrtps %xmm1, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm1 -; SSE-NEXT: mulps %xmm6, %xmm1 -; SSE-NEXT: mulps %xmm4, %xmm6 -; SSE-NEXT: addps %xmm5, %xmm1 -; SSE-NEXT: mulps %xmm6, %xmm1 -; SSE-NEXT: rsqrtps %xmm2, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm2 -; SSE-NEXT: mulps %xmm6, %xmm2 -; SSE-NEXT: mulps %xmm4, %xmm6 -; SSE-NEXT: addps %xmm5, %xmm2 -; SSE-NEXT: mulps %xmm6, %xmm2 -; SSE-NEXT: rsqrtps %xmm3, %xmm6 -; SSE-NEXT: mulps %xmm6, %xmm4 -; SSE-NEXT: mulps %xmm6, %xmm3 -; SSE-NEXT: mulps %xmm6, %xmm3 -; SSE-NEXT: addps %xmm5, %xmm3 +; SSE-NEXT: mulps %xmm8, %xmm2 +; SSE-NEXT: mulps %xmm7, %xmm3 +; SSE-NEXT: mulps %xmm7, %xmm4 +; SSE-NEXT: mulps %xmm7, %xmm3 +; SSE-NEXT: addps %xmm9, %xmm3 ; SSE-NEXT: mulps %xmm4, %xmm3 ; SSE-NEXT: retq ; ; AVX1-LABEL: v16f32_estimate: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtps %ymm0, %ymm2 +; AVX1-NEXT: vrsqrtps %ymm1, %ymm5 ; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmulps %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vmulps %ymm3, 
%ymm2, %ymm4 +; AVX1-NEXT: vmulps %ymm3, %ymm5, %ymm3 ; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmulps %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vaddps %ymm6, %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm6, %ymm1, %ymm1 ; AVX1-NEXT: vmulps %ymm0, %ymm4, %ymm0 -; AVX1-NEXT: vrsqrtps %ymm1, %ymm4 -; AVX1-NEXT: vmulps %ymm3, %ymm4, %ymm3 -; AVX1-NEXT: vmulps %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vmulps %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmulps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: retq ; @@ -629,8 +629,8 @@ ; SSE-NEXT: rsqrtss %xmm1, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm1 ; SSE-NEXT: mulss %xmm2, %xmm1 -; SSE-NEXT: addss {{.*}}(%rip), %xmm1 ; SSE-NEXT: mulss {{.*}}(%rip), %xmm2 +; SSE-NEXT: addss {{.*}}(%rip), %xmm1 ; SSE-NEXT: mulss %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm0 @@ -643,8 +643,8 @@ ; AVX1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulss %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq @@ -677,8 +677,8 @@ ; SSE-NEXT: rsqrtps %xmm1, %xmm2 ; SSE-NEXT: mulps %xmm2, %xmm1 ; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: addps {{.*}}(%rip), %xmm1 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 +; SSE-NEXT: addps {{.*}}(%rip), %xmm1 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: mulps %xmm2, %xmm0 ; SSE-NEXT: retq @@ -690,8 +690,8 @@ ; AVX1-NEXT: vrsqrtps %xmm1, %xmm2 ; AVX1-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -699,10 +699,10 @@ ; AVX512-LABEL: div_sqrt_fabs_v4f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmulps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vrsqrtps %xmm1, %xmm2 ; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1 @@ -725,12 +725,12 @@ ; SSE-LABEL: div_sqrt_fabs_v4f32_fmf: ; SSE: # %bb.0: ; SSE-NEXT: rsqrtps %xmm2, %xmm3 +; SSE-NEXT: andps {{.*}}(%rip), %xmm1 ; SSE-NEXT: mulps %xmm3, %xmm2 ; SSE-NEXT: mulps %xmm3, %xmm2 -; SSE-NEXT: addps {{.*}}(%rip), %xmm2 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 +; SSE-NEXT: addps {{.*}}(%rip), %xmm2 ; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: andps {{.*}}(%rip), %xmm1 ; SSE-NEXT: divps %xmm1, %xmm3 ; SSE-NEXT: mulps %xmm3, %xmm0 ; SSE-NEXT: retq @@ -738,12 +738,12 @@ ; AVX1-LABEL: div_sqrt_fabs_v4f32_fmf: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtps %xmm2, %xmm3 +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulps %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vmulps %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vmulps {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vmulps %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: 
vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vdivps %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -752,14 +752,14 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vrsqrtps %xmm2, %xmm3 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; AVX512-NEXT: vmulps %xmm4, %xmm3, %xmm4 +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm6 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm5 = [NaN,NaN,NaN,NaN] ; AVX512-NEXT: vmulps %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmulps %xmm4, %xmm3, %xmm4 +; AVX512-NEXT: vandps %xmm5, %xmm1, %xmm1 ; AVX512-NEXT: vmulps %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN] +; AVX512-NEXT: vaddps %xmm6, %xmm2, %xmm2 ; AVX512-NEXT: vmulps %xmm2, %xmm4, %xmm2 -; AVX512-NEXT: vandps %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vdivps %xmm1, %xmm2, %xmm1 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sse-domains.ll b/llvm/test/CodeGen/X86/sse-domains.ll --- a/llvm/test/CodeGen/X86/sse-domains.ll +++ b/llvm/test/CodeGen/X86/sse-domains.ll @@ -17,8 +17,8 @@ ; Materialize a zeroinitializer and a constant-pool load in the integer domain. ; The order is not important. -; CHECK: pxor ; CHECK: movdqa +; CHECK: pxor ; The instructions in the loop must all be integer domain as well. ; CHECK: while.body diff --git a/llvm/test/CodeGen/X86/sse_partial_update.ll b/llvm/test/CodeGen/X86/sse_partial_update.ll --- a/llvm/test/CodeGen/X86/sse_partial_update.ll +++ b/llvm/test/CodeGen/X86/sse_partial_update.ll @@ -51,8 +51,8 @@ ; CHECK-LABEL: sqrtss: ; CHECK: ## %bb.0: ; CHECK-NEXT: sqrtss %xmm0, %xmm1 -; CHECK-NEXT: cvtss2sd %xmm1, %xmm2 ; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: cvtss2sd %xmm1, %xmm2 ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm2, %xmm0 @@ -71,8 +71,8 @@ ; CHECK-LABEL: sqrtsd: ; CHECK: ## %bb.0: ; CHECK-NEXT: sqrtsd %xmm0, %xmm1 -; CHECK-NEXT: cvtsd2ss %xmm1, %xmm2 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: cvtsd2ss %xmm1, %xmm2 ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/statepoint-vector.ll b/llvm/test/CodeGen/X86/statepoint-vector.ll --- a/llvm/test/CodeGen/X86/statepoint-vector.ll +++ b/llvm/test/CodeGen/X86/statepoint-vector.ll @@ -30,9 +30,9 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movq %rdi, %xmm1 +; CHECK-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; CHECK-NEXT: paddq %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movdqa %xmm1, (%rsp) ; CHECK-NEXT: callq do_safepoint ; CHECK-NEXT: .Ltmp1: @@ -59,8 +59,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: movaps (%rsi), %xmm0 +; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: movaps %xmm0, (%rsp) ; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: callq do_safepoint @@ -115,7 +115,7 @@ ; Check that we can lower a constant typed as i128 correctly. We don't have ; a representation of larger than 64 bit constant in the StackMap format. At ; the moment, this simply means spilling them, but there's a potential -; optimization for values representable as sext(Con64). 
+; optimization for values representable as sext(Con64). define void @test5() gc "statepoint-example" { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/tailcall-64.ll b/llvm/test/CodeGen/X86/tailcall-64.ll --- a/llvm/test/CodeGen/X86/tailcall-64.ll +++ b/llvm/test/CodeGen/X86/tailcall-64.ll @@ -217,8 +217,8 @@ define void @fold_indexed_load(i8* %mbstr, i64 %idxprom) nounwind uwtable ssp { ; CHECK-LABEL: fold_indexed_load: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: leaq (%rsi,%rsi,4), %rax ; CHECK-NEXT: movq _func_table@{{.*}}(%rip), %rcx +; CHECK-NEXT: leaq (%rsi,%rsi,4), %rax ; CHECK-NEXT: jmpq *16(%rcx,%rax,8) ## TAILCALL entry: %dsplen = getelementptr inbounds [0 x %struct.funcs], [0 x %struct.funcs]* @func_table, i64 0, i64 %idxprom, i32 2 @@ -244,15 +244,15 @@ define i32 @rdar12282281(i32 %n) nounwind uwtable ssp { ; CHECK-LABEL: rdar12282281: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: movslq %edi, %rax ; CHECK-NEXT: movq _funcs@{{.*}}(%rip), %rcx -; CHECK-NEXT: movq (%rcx,%rax,8), %r11 +; CHECK-NEXT: movslq %edi, %rax ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: xorl %r8d, %r8d ; CHECK-NEXT: xorl %r9d, %r9d +; CHECK-NEXT: movq (%rcx,%rax,8), %r11 +; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: jmpq *%r11 ## TAILCALL entry: diff --git a/llvm/test/CodeGen/X86/test-nofold.ll b/llvm/test/CodeGen/X86/test-nofold.ll --- a/llvm/test/CodeGen/X86/test-nofold.ll +++ b/llvm/test/CodeGen/X86/test-nofold.ll @@ -3,8 +3,8 @@ ; We want: ; CHECK: movl 4(%esp), %ecx -; CHECK-NEXT: andl $15, %ecx ; CHECK-NEXT: movl $42, %eax +; CHECK-NEXT: andl $15, %ecx ; CHECK-NEXT: cmovel %ecx, %eax ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/X86/unaligned-load.ll b/llvm/test/CodeGen/X86/unaligned-load.ll --- a/llvm/test/CodeGen/X86/unaligned-load.ll +++ b/llvm/test/CodeGen/X86/unaligned-load.ll @@ -71,8 +71,8 @@ ; I386-NEXT: .p2align 4, 0x90 ; I386-NEXT: LBB1_1: ## %bb ; I386-NEXT: ## =>This Inner Loop Header: Depth=1 -; I386-NEXT: movaps %xmm0, (%esp) ; I386-NEXT: movl $4673097, {{[0-9]+}}(%esp) ## imm = 0x474E49 +; I386-NEXT: movaps %xmm0, (%esp) ; I386-NEXT: movl $1230132307, {{[0-9]+}}(%esp) ## imm = 0x49525453 ; I386-NEXT: movl $541347367, {{[0-9]+}}(%esp) ## imm = 0x20444E27 ; I386-NEXT: movl $840969293, {{[0-9]+}}(%esp) ## imm = 0x32202C4D diff --git a/llvm/test/CodeGen/X86/vec_compare.ll b/llvm/test/CodeGen/X86/vec_compare.ll --- a/llvm/test/CodeGen/X86/vec_compare.ll +++ b/llvm/test/CodeGen/X86/vec_compare.ll @@ -79,15 +79,15 @@ ; CHECK-LABEL: test7: ; CHECK: ## %bb.0: ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] -; CHECK-NEXT: pxor %xmm2, %xmm1 ; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: pand %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: retl %C = icmp sgt <2 x i64> %A, %B @@ -99,15 +99,15 @@ ; CHECK-LABEL: test8: ; CHECK: ## %bb.0: ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] -; CHECK-NEXT: pxor %xmm2, %xmm0 ; CHECK-NEXT: pxor %xmm2, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm0 ; CHECK-NEXT: 
movdqa %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pcmpeqd %xmm0, %xmm1 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-NEXT: pand %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: retl %C = icmp slt <2 x i64> %A, %B @@ -119,15 +119,15 @@ ; CHECK-LABEL: test9: ; CHECK: ## %bb.0: ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] -; CHECK-NEXT: pxor %xmm2, %xmm0 ; CHECK-NEXT: pxor %xmm2, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pcmpeqd %xmm0, %xmm1 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: por %xmm0, %xmm1 ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 ; CHECK-NEXT: pxor %xmm1, %xmm0 @@ -141,15 +141,15 @@ ; CHECK-LABEL: test10: ; CHECK: ## %bb.0: ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] -; CHECK-NEXT: pxor %xmm2, %xmm1 ; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: por %xmm0, %xmm1 ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 ; CHECK-NEXT: pxor %xmm1, %xmm0 @@ -163,15 +163,15 @@ ; CHECK-LABEL: test11: ; CHECK: ## %bb.0: ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-NEXT: pxor %xmm2, %xmm1 ; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: pand %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: retl %C = icmp ugt <2 x i64> %A, %B @@ -183,15 +183,15 @@ ; CHECK-LABEL: test12: ; CHECK: ## %bb.0: ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-NEXT: pxor %xmm2, %xmm0 ; CHECK-NEXT: pxor %xmm2, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pcmpeqd %xmm0, %xmm1 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-NEXT: pand %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: retl %C = icmp ult <2 x i64> %A, %B @@ -203,15 +203,15 @@ ; CHECK-LABEL: test13: ; CHECK: ## %bb.0: ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = 
[2147483648,2147483648,2147483648,2147483648] -; CHECK-NEXT: pxor %xmm2, %xmm0 ; CHECK-NEXT: pxor %xmm2, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pcmpeqd %xmm0, %xmm1 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: por %xmm0, %xmm1 ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 ; CHECK-NEXT: pxor %xmm1, %xmm0 @@ -225,15 +225,15 @@ ; CHECK-LABEL: test14: ; CHECK: ## %bb.0: ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-NEXT: pxor %xmm2, %xmm1 ; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: por %xmm0, %xmm1 ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 ; CHECK-NEXT: pxor %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-gep.ll b/llvm/test/CodeGen/X86/vector-gep.ll --- a/llvm/test/CodeGen/X86/vector-gep.ll +++ b/llvm/test/CodeGen/X86/vector-gep.ll @@ -48,8 +48,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: vpslld $2, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; CHECK-NEXT: popl %eax ; CHECK-NEXT: retl @@ -107,8 +107,8 @@ ; add the base to the offset ; CHECK-LABEL: AGEP8: ; CHECK: # %bb.0: -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retl %A = getelementptr i16, i16* %param, <4 x i32> %off @@ -122,29 +122,47 @@ ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-32, %esp ; CHECK-NEXT: subl $160, %esp -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm3 ; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm5 -; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 -; CHECK-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm3 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovdqa 88(%ebp), %xmm4 +; CHECK-NEXT: vmovdqa 120(%ebp), %xmm6 +; CHECK-NEXT: vmovdqa 152(%ebp), %xmm7 +; CHECK-NEXT: movl 8(%ebp), %eax ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6 +; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa 72(%ebp), %xmm3 +; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpaddd %xmm6, %xmm5, %xmm6 +; CHECK-NEXT: vpaddd %xmm7, %xmm5, %xmm7 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm0 +; CHECK-NEXT: vmovdqa 104(%ebp), 
%xmm1 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm0 +; CHECK-NEXT: vmovdqa 136(%ebp), %xmm2 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vpaddd %xmm1, %xmm5, %xmm1 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovdqa 40(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpaddd %xmm2, %xmm5, %xmm2 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill @@ -152,32 +170,14 @@ ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 72(%ebp), %xmm3 -; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa 184(%ebp), %xmm3 ; CHECK-NEXT: vmovdqa %xmm0, (%esp) # 16-byte Spill -; CHECK-NEXT: vmovdqa 88(%ebp), %xmm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vmovdqa 104(%ebp), %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm5, %xmm1 -; CHECK-NEXT: vmovdqa 120(%ebp), %xmm6 -; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6 -; CHECK-NEXT: vpaddd %xmm6, %xmm5, %xmm6 -; CHECK-NEXT: vmovdqa 136(%ebp), %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm5, %xmm2 -; CHECK-NEXT: vmovdqa 152(%ebp), %xmm7 -; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7 -; CHECK-NEXT: vpaddd %xmm7, %xmm5, %xmm7 ; CHECK-NEXT: vmovdqa 168(%ebp), %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 -; CHECK-NEXT: vmovdqa 184(%ebp), %xmm3 ; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 -; CHECK-NEXT: movl 8(%ebp), %eax +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm3, 240(%eax) ; CHECK-NEXT: vmovdqa %xmm0, 224(%eax) ; CHECK-NEXT: vmovdqa %xmm7, 208(%eax) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -919,11 +919,11 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; SSE-LABEL: test_v16f64: ; SSE: # %bb.0: +; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: addpd %xmm6, %xmm2 ; SSE-NEXT: addpd %xmm7, %xmm3 ; SSE-NEXT: addpd %xmm5, %xmm1 ; SSE-NEXT: addpd %xmm3, %xmm1 -; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: addpd %xmm2, %xmm4 ; SSE-NEXT: addpd %xmm1, %xmm4 ; SSE-NEXT: movapd %xmm4, %xmm1 @@ -1133,9 +1133,9 @@ ; SSE: # %bb.0: ; SSE-NEXT: addpd %xmm6, %xmm2 ; SSE-NEXT: addpd %xmm4, %xmm0 -; SSE-NEXT: addpd %xmm2, %xmm0 ; SSE-NEXT: addpd %xmm7, %xmm3 ; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: addpd %xmm2, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 @@ -1340,9 +1340,9 @@ ; SSE: # %bb.0: ; SSE-NEXT: addpd %xmm6, %xmm2 ; SSE-NEXT: addpd %xmm4, %xmm0 -; SSE-NEXT: addpd %xmm2, %xmm0 ; SSE-NEXT: addpd 
%xmm7, %xmm3 ; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: addpd %xmm2, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -653,11 +653,11 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; SSE-LABEL: test_v16f64: ; SSE: # %bb.0: +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm6, %xmm2 ; SSE-NEXT: mulpd %xmm7, %xmm3 ; SSE-NEXT: mulpd %xmm5, %xmm1 ; SSE-NEXT: mulpd %xmm3, %xmm1 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm2, %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 ; SSE-NEXT: movapd %xmm4, %xmm1 @@ -795,9 +795,9 @@ ; SSE: # %bb.0: ; SSE-NEXT: mulpd %xmm6, %xmm2 ; SSE-NEXT: mulpd %xmm4, %xmm0 -; SSE-NEXT: mulpd %xmm2, %xmm0 ; SSE-NEXT: mulpd %xmm7, %xmm3 ; SSE-NEXT: mulpd %xmm5, %xmm1 +; SSE-NEXT: mulpd %xmm2, %xmm0 ; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: mulpd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 @@ -932,9 +932,9 @@ ; SSE: # %bb.0: ; SSE-NEXT: mulpd %xmm6, %xmm2 ; SSE-NEXT: mulpd %xmm4, %xmm0 -; SSE-NEXT: mulpd %xmm2, %xmm0 ; SSE-NEXT: mulpd %xmm7, %xmm3 ; SSE-NEXT: mulpd %xmm5, %xmm1 +; SSE-NEXT: mulpd %xmm2, %xmm0 ; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: mulpd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -283,16 +283,16 @@ ; SSE2-LABEL: combine_bitwise_ops_test1b: ; SSE2: # %bb.0: ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test1b: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; @@ -317,16 +317,16 @@ ; SSE2-LABEL: combine_bitwise_ops_test2b: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test2b: ; SSSE3: # %bb.0: ; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; @@ -1553,8 +1553,8 @@ ; SSE-LABEL: combine_test21: ; SSE: # %bb.0: ; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, (%rdi) ; SSE-NEXT: retq ; @@ -2605,15 +2605,15 @@ define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) { ; SSE2-LABEL: combine_constant_insertion_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movd 
%edi, %xmm1 ; SSE2-NEXT: movaps {{.*#+}} xmm0 = +; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_constant_insertion_v4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movd %edi, %xmm1 ; SSSE3-NEXT: movaps {{.*#+}} xmm0 = +; SSSE3-NEXT: movd %edi, %xmm1 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSSE3-NEXT: retq ; @@ -2865,12 +2865,12 @@ ; SSE2-LABEL: shuffle_extract_insert_double: ; SSE2: # %bb.0: ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: retq @@ -2915,14 +2915,14 @@ ; SSE2-LABEL: shuffle_extract_concat_insert: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: retq ; @@ -2946,8 +2946,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX-NEXT: retq %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> %a0 = extractelement <8 x i16> %a, i32 0 @@ -2970,15 +2970,15 @@ ; SSE2-LABEL: shuffle_scalar_to_vector_extract: ; SSE2: # %bb.0: ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movsbl (%rsi), %esi +; SSE2-NEXT: movsbl (%rdx), %ecx ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movd %esi, %xmm0 ; SSE2-NEXT: psraw $8, %xmm1 ; SSE2-NEXT: pextrw $7, %xmm1, %eax ; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movsbl (%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movsbl (%rdx), %eax -; 
SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -2989,15 +2989,15 @@ ; SSSE3-LABEL: shuffle_scalar_to_vector_extract: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movsbl (%rsi), %esi +; SSSE3-NEXT: movsbl (%rdx), %ecx ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movd %esi, %xmm0 ; SSSE3-NEXT: psraw $8, %xmm1 ; SSSE3-NEXT: pextrw $7, %xmm1, %eax ; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: movsbl (%rsi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movsbl (%rdx), %eax -; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSSE3-NEXT: pxor %xmm0, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -3008,6 +3008,7 @@ ; SSE41-LABEL: shuffle_scalar_to_vector_extract: ; SSE41: # %bb.0: ; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 +; SSE41-NEXT: movsbl (%rsi), %esi ; SSE41-NEXT: pextrw $4, %xmm0, %eax ; SSE41-NEXT: pextrw $7, %xmm0, %ecx ; SSE41-NEXT: pxor %xmm0, %xmm0 @@ -3015,15 +3016,15 @@ ; SSE41-NEXT: movl $65531, %eax # imm = 0xFFFB ; SSE41-NEXT: pinsrw $2, %eax, %xmm0 ; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 -; SSE41-NEXT: movsbl (%rsi), %eax -; SSE41-NEXT: pinsrw $5, %eax, %xmm0 -; SSE41-NEXT: movsbl (%rdx), %eax -; SSE41-NEXT: pinsrw $6, %eax, %xmm0 +; SSE41-NEXT: movsbl (%rdx), %ecx +; SSE41-NEXT: pinsrw $5, %esi, %xmm0 +; SSE41-NEXT: pinsrw $6, %ecx, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_scalar_to_vector_extract: ; AVX: # %bb.0: ; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX-NEXT: movsbl (%rsi), %esi ; AVX-NEXT: vpextrw $4, %xmm0, %eax ; AVX-NEXT: vpextrw $7, %xmm0, %ecx ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 @@ -3031,10 +3032,9 @@ ; AVX-NEXT: movl $65531, %eax # imm = 0xFFFB ; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movsbl (%rsi), %eax -; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX-NEXT: movsbl (%rdx), %eax -; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 +; AVX-NEXT: movsbl (%rdx), %ecx +; AVX-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; AVX-NEXT: retq %tmp = load <8 x i8>, <8 x i8>* %p0, align 1 %tmp1 = sext <8 x i8> %tmp to <8 x i16> @@ -3059,8 +3059,8 @@ ; SSE2-LABEL: PR43024: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; SSE2-NEXT: movaps %xmm0, (%rax) ; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm0, (%rax) ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] ; SSE2-NEXT: addss %xmm0, %xmm1 ; SSE2-NEXT: xorps %xmm0, %xmm0 @@ -3072,8 +3072,8 @@ ; SSSE3-LABEL: PR43024: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; SSSE3-NEXT: movaps %xmm0, (%rax) ; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSSE3-NEXT: movaps %xmm0, (%rax) ; SSSE3-NEXT: addss %xmm0, %xmm1 ; SSSE3-NEXT: xorps %xmm0, %xmm0 ; SSSE3-NEXT: addss %xmm0, %xmm1 @@ -3084,8 +3084,8 @@ ; SSE41-LABEL: PR43024: ; SSE41: # %bb.0: ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; SSE41-NEXT: movaps 
%xmm0, (%rax) ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, (%rax) ; SSE41-NEXT: addss %xmm0, %xmm1 ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: addss %xmm0, %xmm1 @@ -3096,9 +3096,9 @@ ; AVX-LABEL: PR43024: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vmovaps %xmm0, (%rax) ; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovss %xmm0, (%rax) @@ -3120,173 +3120,173 @@ ; SSE2-LABEL: PR45604: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movl $11, %r8d +; SSE2-NEXT: pextrw $2, %xmm1, %esi +; SSE2-NEXT: pextrw $3, %xmm1, %edx ; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pextrw $1, %xmm1, %ecx ; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: pextrw $4, %xmm1, %esi ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movl $11, %eax -; SSE2-NEXT: pinsrw $2, %eax, %xmm0 -; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pinsrw $2, %r8d, %xmm2 +; SSE2-NEXT: pinsrw $2, %r8d, %xmm0 +; SSE2-NEXT: pinsrw $4, %edx, %xmm2 +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: pextrw $6, %xmm1, %esi +; SSE2-NEXT: pextrw $5, %xmm1, %edx ; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 -; SSE2-NEXT: pinsrw $6, %eax, %xmm0 -; SSE2-NEXT: pextrw $2, %xmm1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: pinsrw $2, %eax, %xmm2 -; SSE2-NEXT: pextrw $3, %xmm1, %ecx -; SSE2-NEXT: pinsrw $4, %ecx, %xmm2 -; SSE2-NEXT: pinsrw $6, %eax, %xmm2 -; SSE2-NEXT: pextrw $4, %xmm1, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: pinsrw $2, %eax, %xmm3 -; SSE2-NEXT: pextrw $5, %xmm1, %ecx -; SSE2-NEXT: pinsrw $4, %ecx, %xmm3 -; SSE2-NEXT: pinsrw $6, %eax, %xmm3 -; SSE2-NEXT: pextrw $6, %xmm1, %ecx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: pinsrw $2, %eax, %xmm4 -; SSE2-NEXT: pextrw $7, %xmm1, %ecx -; SSE2-NEXT: pinsrw $4, %ecx, %xmm4 -; SSE2-NEXT: pinsrw $6, %eax, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 48(%rdi) -; SSE2-NEXT: movdqa %xmm3, 32(%rdi) +; SSE2-NEXT: pinsrw $2, %r8d, %xmm3 +; SSE2-NEXT: pinsrw $6, %r8d, %xmm0 +; SSE2-NEXT: pinsrw $6, %r8d, %xmm2 +; SSE2-NEXT: pinsrw $4, %edx, %xmm3 +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: pextrw $7, %xmm1, %edx +; SSE2-NEXT: pinsrw $2, %r8d, %xmm4 ; SSE2-NEXT: movdqa %xmm2, 16(%rdi) ; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: pinsrw $6, %r8d, %xmm3 +; SSE2-NEXT: pinsrw $4, %edx, %xmm4 +; SSE2-NEXT: movdqa %xmm3, 32(%rdi) +; SSE2-NEXT: pinsrw $6, %r8d, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 48(%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: PR45604: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa (%rsi), %xmm1 +; SSSE3-NEXT: movl $11, %r8d +; SSSE3-NEXT: pextrw $2, %xmm1, %esi +; SSSE3-NEXT: pextrw $3, %xmm1, %edx ; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: pextrw $1, %xmm1, %ecx ; SSSE3-NEXT: movzwl %ax, %eax +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: pextrw $4, %xmm1, %esi ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movl $11, %eax -; SSSE3-NEXT: pinsrw $2, %eax, %xmm0 -; SSSE3-NEXT: pextrw $1, %xmm1, %ecx +; SSSE3-NEXT: pinsrw $2, %r8d, %xmm2 +; SSSE3-NEXT: pinsrw $2, %r8d, %xmm0 +; SSSE3-NEXT: pinsrw $4, %edx, %xmm2 +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: pextrw $6, %xmm1, %esi +; SSSE3-NEXT: pextrw $5, %xmm1, %edx ; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm0 -; SSSE3-NEXT: pextrw $2, %xmm1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: pinsrw $2, %eax, 
%xmm2 -; SSSE3-NEXT: pextrw $3, %xmm1, %ecx -; SSSE3-NEXT: pinsrw $4, %ecx, %xmm2 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm2 -; SSSE3-NEXT: pextrw $4, %xmm1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: pinsrw $2, %eax, %xmm3 -; SSSE3-NEXT: pextrw $5, %xmm1, %ecx -; SSSE3-NEXT: pinsrw $4, %ecx, %xmm3 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm3 -; SSSE3-NEXT: pextrw $6, %xmm1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: pinsrw $2, %eax, %xmm4 -; SSSE3-NEXT: pextrw $7, %xmm1, %ecx -; SSSE3-NEXT: pinsrw $4, %ecx, %xmm4 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 48(%rdi) -; SSSE3-NEXT: movdqa %xmm3, 32(%rdi) +; SSSE3-NEXT: pinsrw $2, %r8d, %xmm3 +; SSSE3-NEXT: pinsrw $6, %r8d, %xmm0 +; SSSE3-NEXT: pinsrw $6, %r8d, %xmm2 +; SSSE3-NEXT: pinsrw $4, %edx, %xmm3 +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: pextrw $7, %xmm1, %edx +; SSSE3-NEXT: pinsrw $2, %r8d, %xmm4 ; SSSE3-NEXT: movdqa %xmm2, 16(%rdi) ; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: pinsrw $6, %r8d, %xmm3 +; SSSE3-NEXT: pinsrw $4, %edx, %xmm4 +; SSSE3-NEXT: movdqa %xmm3, 32(%rdi) +; SSSE3-NEXT: pinsrw $6, %r8d, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 48(%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: PR45604: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rsi), %xmm1 +; SSE41-NEXT: movl $11, %r8d +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pextrw $4, %xmm1, %edx ; SSE41-NEXT: pextrw $2, %xmm1, %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: movl $11, %eax -; SSE41-NEXT: pinsrw $2, %eax, %xmm0 +; SSE41-NEXT: pextrw $5, %xmm1, %esi +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7] ; SSE41-NEXT: pextrw $3, %xmm1, %ecx +; SSE41-NEXT: pinsrw $2, %r8d, %xmm4 +; SSE41-NEXT: movd %edx, %xmm2 +; SSE41-NEXT: pextrw $6, %xmm1, %edx +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pinsrw $2, %r8d, %xmm2 +; SSE41-NEXT: pinsrw $2, %r8d, %xmm0 +; SSE41-NEXT: pinsrw $4, %esi, %xmm2 +; SSE41-NEXT: movd %edx, %xmm3 +; SSE41-NEXT: pextrw $1, %xmm1, %edx +; SSE41-NEXT: pextrw $7, %xmm1, %esi ; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 -; SSE41-NEXT: pinsrw $6, %eax, %xmm0 -; SSE41-NEXT: pextrw $4, %xmm1, %ecx -; SSE41-NEXT: movd %ecx, %xmm2 -; SSE41-NEXT: pinsrw $2, %eax, %xmm2 -; SSE41-NEXT: pextrw $5, %xmm1, %ecx -; SSE41-NEXT: pinsrw $4, %ecx, %xmm2 -; SSE41-NEXT: pinsrw $6, %eax, %xmm2 -; SSE41-NEXT: pextrw $6, %xmm1, %ecx -; SSE41-NEXT: movd %ecx, %xmm3 -; SSE41-NEXT: pinsrw $2, %eax, %xmm3 -; SSE41-NEXT: pextrw $7, %xmm1, %ecx -; SSE41-NEXT: pinsrw $4, %ecx, %xmm3 -; SSE41-NEXT: pinsrw $6, %eax, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7] -; SSE41-NEXT: pinsrw $2, %eax, %xmm4 -; SSE41-NEXT: pextrw $1, %xmm1, %ecx -; SSE41-NEXT: pinsrw $4, %ecx, %xmm4 -; SSE41-NEXT: pinsrw $6, %eax, %xmm4 -; SSE41-NEXT: movdqa %xmm4, (%rdi) -; SSE41-NEXT: movdqa %xmm3, 48(%rdi) +; SSE41-NEXT: pinsrw $2, %r8d, %xmm3 +; SSE41-NEXT: pinsrw $6, %r8d, %xmm0 +; SSE41-NEXT: pinsrw $6, %r8d, %xmm2 +; SSE41-NEXT: pinsrw $4, %esi, %xmm3 +; SSE41-NEXT: pinsrw $4, %edx, %xmm4 ; SSE41-NEXT: movdqa %xmm2, 32(%rdi) ; SSE41-NEXT: movdqa %xmm0, 16(%rdi) +; SSE41-NEXT: pinsrw $6, %r8d, %xmm3 +; SSE41-NEXT: pinsrw $6, %r8d, %xmm4 +; SSE41-NEXT: movdqa %xmm4, (%rdi) +; SSE41-NEXT: movdqa %xmm3, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: PR45604: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
[11,11,11,0,11,11,11,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vmovups %ymm1, 32(%rdi) +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rdi) -; AVX1-NEXT: vmovups %ymm1, 32(%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: PR45604: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,0,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,0,0,11,11,11,11,11,11,11,11] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = 
ymm6[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rdi) ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, 32(%rdi) -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rdi) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: PR45604: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,u,u,u,u,2,3,6,7,u,u,u,u,16,17,20,21,u,u,u,u,18,19,22,23,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,0,0,0,0,0,0,11,11,11,11,11,11,11,11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[4,5,0,1,u,u,u,u,6,7,2,3,u,u,u,u,20,21,16,17,u,u,u,u,22,23,18,19,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,0,0,0,0,0,0,11,11,11,11,11,11,11,11] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,u,u,u,u,10,11,14,15,u,u,u,u,24,25,28,29,u,u,u,u,26,27,30,31,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[12,13,8,9,u,u,u,u,14,15,10,11,u,u,u,u,28,29,24,25,u,u,u,u,30,31,26,27,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm4, {{.*}}(%rip), %ymm5, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,u,u,u,u,10,11,14,15,u,u,u,u,24,25,28,29,u,u,u,u,26,27,30,31,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[12,13,8,9,u,u,u,u,14,15,10,11,u,u,u,u,28,29,24,25,u,u,u,u,30,31,26,27,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7] diff --git a/llvm/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll b/llvm/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll --- a/llvm/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll +++ b/llvm/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll @@ -6,8 +6,8 @@ define dso_local void @copy_7_bytes(i8* noalias nocapture, i8* noalias nocapture readonly) nounwind #0 { ; CHECK-LABEL: copy_7_bytes: ; CHECK: # %bb.0: -; CHECK-NEXT: movl (%rsi), %eax ; CHECK-NEXT: movl 3(%rsi), %ecx +; CHECK-NEXT: movl (%rsi), %eax ; CHECK-NEXT: movl %ecx, 3(%rdi) ; CHECK-NEXT: movl %eax, (%rdi) ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vsplit-and.ll b/llvm/test/CodeGen/X86/vsplit-and.ll --- a/llvm/test/CodeGen/X86/vsplit-and.ll +++ b/llvm/test/CodeGen/X86/vsplit-and.ll @@ -23,34 +23,34 @@ define void @t2(<3 x i64>* %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly { ; CHECK-LABEL: t2: ; CHECK: # %bb.0: +; CHECK-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: movq %r9, %xmm1 ; CHECK-NEXT: movq %r8, %xmm0 +; CHECK-NEXT: movq %rsi, %xmm2 +; CHECK-NEXT: pxor %xmm4, %xmm4 +; CHECK-NEXT: pcmpeqd %xmm5, %xmm5 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: movq %rdx, %xmm1 -; CHECK-NEXT: 
movq %rsi, %xmm2 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; CHECK-NEXT: movq %rcx, %xmm1 -; CHECK-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; CHECK-NEXT: pxor %xmm4, %xmm4 +; CHECK-NEXT: pcmpeqq %xmm4, %xmm0 ; CHECK-NEXT: pcmpeqq %xmm4, %xmm1 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm5 -; CHECK-NEXT: pxor %xmm5, %xmm1 ; CHECK-NEXT: pcmpeqq %xmm4, %xmm2 +; CHECK-NEXT: pxor %xmm5, %xmm0 +; CHECK-NEXT: pxor %xmm5, %xmm1 ; CHECK-NEXT: pxor %xmm5, %xmm2 ; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] ; CHECK-NEXT: pcmpeqq %xmm4, %xmm3 ; CHECK-NEXT: pxor %xmm5, %xmm3 -; CHECK-NEXT: pcmpeqq %xmm4, %xmm0 -; CHECK-NEXT: pxor %xmm5, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: andps %xmm2, %xmm0 ; CHECK-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero -; CHECK-NEXT: psllq $63, %xmm1 -; CHECK-NEXT: psrad $31, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; CHECK-NEXT: psllq $63, %xmm1 ; CHECK-NEXT: psllq $63, %xmm0 +; CHECK-NEXT: psrad $31, %xmm1 ; CHECK-NEXT: psrad $31, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; CHECK-NEXT: movq %xmm0, 16(%rdi) ; CHECK-NEXT: movdqa %xmm1, (%rdi) diff --git a/llvm/test/CodeGen/X86/x86-cmov-converter.ll b/llvm/test/CodeGen/X86/x86-cmov-converter.ll --- a/llvm/test/CodeGen/X86/x86-cmov-converter.ll +++ b/llvm/test/CodeGen/X86/x86-cmov-converter.ll @@ -289,10 +289,10 @@ ; CHECK-LABEL: Transform ; CHECK-NOT: cmov -; CHECK: divl [[a:%[0-9a-z]*]] +; CHECK: movl %r8d, [[s2:%[0-9a-z]*]] +; CHECK: divl %r8d ; CHECK: movl $11, [[s1:%[0-9a-z]*]] -; CHECK: movl [[a]], [[s2:%[0-9a-z]*]] -; CHECK: cmpl [[a]], %edx +; CHECK: cmpl %r8d, %edx ; CHECK: ja [[SinkBB:.*]] ; CHECK: [[FalseBB:.*]]: ; CHECK: movl $22, [[s1]] diff --git a/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll b/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll --- a/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll +++ b/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll @@ -15,8 +15,8 @@ ; CHECK: subl $20, %esp ; CHECK: .cv_fpo_stackalloc 20 ; CHECK: .cv_fpo_endprologue -; CHECK: movl 28(%esp), %esi ; CHECK: ___security_cookie +; CHECK: movl 28(%esp), %esi ; CHECK: movl %esi, {{[0-9]*}}(%esp) ; CHECK: movl %esi, {{[0-9]*}}(%esp) @@ -30,7 +30,7 @@ ; CHECK: addl $20, %esp ; CHECK: popl %esi ; CHECK: retl -; CHECK: Ltmp2: +; CHECK: Ltmp3: ; CHECK: .cv_fpo_endproc ; ModuleID = 't.c' diff --git a/llvm/test/DebugInfo/COFF/inlining.ll b/llvm/test/DebugInfo/COFF/inlining.ll --- a/llvm/test/DebugInfo/COFF/inlining.ll +++ b/llvm/test/DebugInfo/COFF/inlining.ll @@ -170,8 +170,8 @@ ; OBJ: PtrEnd: 0x0 ; OBJ: Inlinee: bar (0x1002) ; OBJ: BinaryAnnotations [ -; OBJ-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0x8, LineOffset: 1} -; OBJ-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0x9, LineOffset: 1} +; OBJ-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0xA, LineOffset: 1} +; OBJ-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0x7, LineOffset: 1} ; OBJ-NEXT: ChangeLineOffset: 1 ; OBJ-NEXT: ChangeCodeOffset: 0x14 ; OBJ-NEXT: ChangeCodeLength: 0x7 diff --git a/llvm/test/DebugInfo/COFF/local-variables.ll b/llvm/test/DebugInfo/COFF/local-variables.ll --- a/llvm/test/DebugInfo/COFF/local-variables.ll +++ b/llvm/test/DebugInfo/COFF/local-variables.ll @@ -34,14 +34,14 @@ ; ASM: testl %ecx, %ecx ; ASM: je .LBB0_2 ; ASM: # %bb.1: # %if.then -; ASM: [[if_start:\.Ltmp.*]]: ; ASM: .cv_loc 0 1 9 9 # t.cpp:9:9 +; ASM: leaq 44(%rsp), %rcx +; ASM: 
[[if_start:\.Ltmp.*]]:
; ASM: movl $42, 40(%rsp)
; ASM: [[inline_site1:\.Ltmp.*]]:
; ASM: .cv_inline_site_id 1 within 0 inlined_at 1 10 5
; ASM: .cv_loc 1 1 4 7 # t.cpp:4:7
; ASM: movl $3, 44(%rsp)
-; ASM: leaq 44(%rsp), %rcx
; ASM: .cv_loc 1 1 5 3 # t.cpp:5:3
; ASM: callq capture
; ASM: leaq 40(%rsp), %rcx
@@ -49,12 +49,13 @@
; ASM: [[else_start:\.Ltmp.*]]:
; ASM: .LBB0_2: # %if.else
; ASM: .cv_loc 0 1 13 9 # t.cpp:13:9
+; ASM: leaq 48(%rsp), %rcx
+; ASM: [[movl:\.Ltmp.*]]:
; ASM: movl $42, 36(%rsp)
; ASM: [[inline_site2:\.Ltmp.*]]:
; ASM: .cv_inline_site_id 2 within 0 inlined_at 1 14 5
; ASM: .cv_loc 2 1 4 7 # t.cpp:4:7
; ASM: movl $3, 48(%rsp)
-; ASM: leaq 48(%rsp), %rcx
; ASM: .cv_loc 2 1 5 3 # t.cpp:5:3
; ASM: callq capture
; ASM: leaq 36(%rsp), %rcx
@@ -81,7 +82,7 @@
; ASM: .long 116 # TypeIndex
; ASM: .short 0 # Flags
; ASM: .asciz "b"
-; ASM: .cv_def_range [[else_start]] [[else_end]], frame_ptr_rel, 36
+; ASM: .cv_def_range [[movl]] [[else_end]], frame_ptr_rel, 36
; ASM: .short 4429 # Record kind: S_INLINESITE
; ASM: .short 4414 # Record kind: S_LOCAL
; ASM: .long 116 # TypeIndex
@@ -127,9 +128,9 @@
; OBJ: DefRangeFramePointerRelSym {
; OBJ: Offset: 40
; OBJ: LocalVariableAddrRange {
-; OBJ: OffsetStart: .text+0xC
+; OBJ: OffsetStart: .text+0x11
; OBJ: ISectStart: 0x0
-; OBJ: Range: 0x21
+; OBJ: Range: 0x1C
; OBJ: }
; OBJ: }
; OBJ: LocalSym {
@@ -141,9 +142,9 @@
; OBJ: DefRangeFramePointerRelSym {
; OBJ: Offset: 36
; OBJ: LocalVariableAddrRange {
-; OBJ: OffsetStart: .text+0x2D
+; OBJ: OffsetStart: .text+0x32
; OBJ: ISectStart: 0x0
-; OBJ: Range: 0x1F
+; OBJ: Range: 0x1A
; OBJ: }
; OBJ: }
; OBJ: InlineSiteSym {
@@ -152,8 +153,8 @@
; OBJ: Inlinee: will_be_inlined (0x1002)
; OBJ: BinaryAnnotations [
; OBJ: ChangeLineOffset: 1
-; OBJ: ChangeCodeOffset: 0x14
-; OBJ: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0xD, LineOffset: 1}
+; OBJ: ChangeCodeOffset: 0x19
+; OBJ: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0x8, LineOffset: 1}
; OBJ: ChangeCodeLength: 0xC
; OBJ: ]
; OBJ: }
@@ -166,9 +167,9 @@
; OBJ: DefRangeFramePointerRelSym {
; OBJ: Offset: 44
; OBJ: LocalVariableAddrRange {
-; OBJ: OffsetStart: .text+0x14
+; OBJ: OffsetStart: .text+0x19
; OBJ: ISectStart: 0x0
-; OBJ: Range: 0x19
+; OBJ: Range: 0x14
; OBJ: }
; OBJ: }
; OBJ: InlineSiteEnd {
@@ -179,8 +180,8 @@
; OBJ: Inlinee: will_be_inlined (0x1002)
; OBJ: BinaryAnnotations [
; OBJ: ChangeLineOffset: 1
-; OBJ: ChangeCodeOffset: 0x35
-; OBJ: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0xD, LineOffset: 1}
+; OBJ: ChangeCodeOffset: 0x3A
+; OBJ: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0x8, LineOffset: 1}
; OBJ: ChangeCodeLength: 0xA
; OBJ: ]
; OBJ: }
@@ -193,9 +194,9 @@
; OBJ: DefRangeFramePointerRelSym {
; OBJ: Offset: 48
; OBJ: LocalVariableAddrRange {
-; OBJ: OffsetStart: .text+0x35
+; OBJ: OffsetStart: .text+0x3A
; OBJ: ISectStart: 0x0
-; OBJ: Range: 0x17
+; OBJ: Range: 0x12
; OBJ: }
; OBJ: }
; OBJ: InlineSiteEnd {
diff --git a/llvm/test/DebugInfo/COFF/pieces.ll b/llvm/test/DebugInfo/COFF/pieces.ll
--- a/llvm/test/DebugInfo/COFF/pieces.ll
+++ b/llvm/test/DebugInfo/COFF/pieces.ll
@@ -48,16 +48,16 @@
; ASM: .cv_loc 0 1 13 11 # t.c:13:11
; ASM: movl %edi, %ecx
; ASM: callq g
+; ASM: movl %esi, %ecx
; ASM: movl %eax, %edi
; ASM: [[ox_start:\.Ltmp[0-9]+]]:
; ASM: #DEBUG_VALUE: loop_csr:o <- [DW_OP_LLVM_fragment 0 32] $edi
; ASM: .cv_loc 0 1 14 11 # t.c:14:11
-; ASM: movl %esi, %ecx
; ASM: callq g
; ASM: movl %eax, %esi
; ASM: [[oy_start:\.Ltmp[0-9]+]]:
; ASM: #DEBUG_VALUE: loop_csr:o <- [DW_OP_LLVM_fragment 32 32] $esi
-; ASM: cmpl n(%rip), %eax
+; ASM: cmpl n(%rip), %ecx
; ASM: jl .LBB0_3
; ASM: [[loopskip_start:\.Ltmp[0-9]+]]:
; ASM: #DEBUG_VALUE: loop_csr:o <- [DW_OP_LLVM_fragment 0 32] 0
@@ -96,8 +96,8 @@
; ASM: retq
; ASM-LABEL: bitpiece_spill: # @bitpiece_spill
-; ASM: #DEBUG_VALUE: bitpiece_spill:o <- [DW_OP_LLVM_fragment 0 32] 0
; ASM: xorl %ecx, %ecx
+; ASM: #DEBUG_VALUE: bitpiece_spill:o <- [DW_OP_LLVM_fragment 0 32] 0
; ASM: callq g
; ASM: movl %eax, [[offset_o_x:[0-9]+]](%rsp) # 4-byte Spill
; ASM: [[spill_o_x_start:\.Ltmp[0-9]+]]:
diff --git a/llvm/test/DebugInfo/COFF/register-variables.ll b/llvm/test/DebugInfo/COFF/register-variables.ll
--- a/llvm/test/DebugInfo/COFF/register-variables.ll
+++ b/llvm/test/DebugInfo/COFF/register-variables.ll
@@ -36,14 +36,14 @@
; ASM: je .LBB0_2
; ASM: [[after_je:\.Ltmp.*]]:
; ASM: # %bb.1: # %if.then
-; ASM-DAG: #DEBUG_VALUE: inlineinc:a <- $eax
-; ASM-DAG: #DEBUG_VALUE: a <- $eax
-; ASM-DAG: #DEBUG_VALUE: f:p <- $esi
-; ASM: addl $1, %eax
-; ASM: [[after_inc_eax:\.Ltmp.*]]:
-; ASM: #DEBUG_VALUE: inlineinc:b <- $eax
+; ASM: #DEBUG_VALUE: f:p <- $esi
+; ASM: #DEBUG_VALUE: a <- $eax
+; ASM: #DEBUG_VALUE: inlineinc:a <- $eax
; ASM: addl $1, x(%rip)
+; ASM: [[after_inc_eax:\.Ltmp.*]]:
+; ASM: addl $1, %eax
; ASM: [[after_if:\.Ltmp.*]]:
+; ASM: #DEBUG_VALUE: inlineinc:b <- $eax
; ASM: .LBB0_2: # %if.else
; ASM: #DEBUG_VALUE: f:p <- $esi
; ASM: #DEBUG_VALUE: c <- $eax
@@ -63,20 +63,20 @@
; ASM: .cv_def_range [[after_if]] [[func_finished]], reg, 17
; ASM: .short 4414 # Record kind: S_LOCAL
; ASM: .asciz "a"
-; ASM: .cv_def_range [[after_je]] [[after_inc_eax]], reg, 17
+; ASM: .cv_def_range [[after_je]] [[after_if]], reg, 17
; ASM: .short 4414 # Record kind: S_LOCAL
; ASM: .asciz "b"
-; ASM: .cv_def_range [[after_if]] [[after_if]], reg, 17
+; ASM: .cv_def_range [[after_inc_eax]] [[after_if]], reg, 17
; Note: "b" is a victim of tail de-duplication / branch folding.
; ASM: .short 4429 # Record kind: S_INLINESITE
; ASM: .short 4414 # Record kind: S_LOCAL
; ASM: .asciz "a"
-; ASM: .cv_def_range [[after_je]] [[after_inc_eax]], reg, 17
+; ASM: .cv_def_range [[after_je]] [[after_if]], reg, 17
; ASM: .short 4414 # Record kind: S_LOCAL
; ASM: .asciz "b"
-; ASM: .cv_def_range [[after_inc_eax]] [[after_if]], reg, 17
+; ASM: .cv_def_range [[after_if]] [[after_if]], reg, 17
; ASM: .short 4430 # Record kind: S_INLINESITE_END
; OBJ: Subsection [
@@ -152,7 +152,7 @@
; OBJ: LocalVariableAddrRange {
; OBJ: OffsetStart: .text+0x10
; OBJ: ISectStart: 0x0
-; OBJ: Range: 0x3
+; OBJ: Range: 0xA
; OBJ: }
; OBJ: }
; OBJ: LocalSym {
@@ -164,9 +164,9 @@
; OBJ: DefRangeRegisterSym {
; OBJ: Register: EAX (0x11)
; OBJ: LocalVariableAddrRange {
-; OBJ: OffsetStart: .text+0x13
+; OBJ: OffsetStart: .text+0x1A
; OBJ: ISectStart: 0x0
-; OBJ: Range: 0x7
+; OBJ: Range: 0x0
; OBJ: }
; OBJ: }
; OBJ: InlineSiteEnd {
diff --git a/llvm/test/DebugInfo/X86/spill-nospill.ll b/llvm/test/DebugInfo/X86/spill-nospill.ll
--- a/llvm/test/DebugInfo/X86/spill-nospill.ll
+++ b/llvm/test/DebugInfo/X86/spill-nospill.ll
@@ -28,9 +28,9 @@
; CHECK: #APP
; CHECK: #NO_APP
; CHECK: callq g
+; CHECK: movl %eax, %ecx
; CHECK: movl %eax, %[[CSR:[^ ]*]]
; CHECK: #DEBUG_VALUE: f:y <- $esi
-; CHECK: movl %eax, %ecx
; CHECK: callq g
; CHECK: movl %[[CSR]], %ecx
; CHECK: callq g
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -17,8 +17,8 @@
; X64-LABEL: simple:
; X64: # %bb.0: # %entry
; X64-NEXT: movslq %edx, %rcx
-; X64-NEXT: shlq $2, %rcx
; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: shlq $2, %rcx
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB0_1: # %loop
; X64-NEXT: # =>This Inner Loop Header: Depth=1
@@ -41,11 +41,11 @@
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: shll $2, %edx
; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: shll $2, %edx
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB0_1: # %loop
; X32-NEXT: # =>This Inner Loop Header: Depth=1
@@ -97,9 +97,9 @@
; X64: # %bb.0: # %entry
; X64-NEXT: movslq %edx, %rcx
; X64-NEXT: movq %rcx, %rdx
-; X64-NEXT: shlq $4, %rdx
; X64-NEXT: leaq (,%rcx,4), %rax
; X64-NEXT: leaq (%rax,%rax,2), %r8
+; X64-NEXT: shlq $4, %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB1_1: # %loop
@@ -124,9 +124,9 @@
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %ecx, %edi
-; X32-NEXT: shll $4, %edi
; X32-NEXT: leal (,%ecx,4), %eax
; X32-NEXT: leal (%eax,%eax,2), %ebx
+; X32-NEXT: shll $4, %edi
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB1_1: # %loop
@@ -190,16 +190,16 @@
; X64-NEXT: testl %r9d, %r9d
; X64-NEXT: je .LBB2_3
; X64-NEXT: # %bb.1: # %for.body.lr.ph
-; X64-NEXT: leal (%rsi,%rsi), %r14d
-; X64-NEXT: leal (%rsi,%rsi,2), %ebx
; X64-NEXT: addl %esi, %ecx
+; X64-NEXT: leal (%rsi,%rsi,2), %ebx
; X64-NEXT: leal (,%rsi,4), %eax
+; X64-NEXT: leal (%rsi,%rsi), %r14d
; X64-NEXT: leal (%rcx,%rsi,4), %ebp
+; X64-NEXT: movslq %r8d, %rcx
; X64-NEXT: movslq %eax, %r10
; X64-NEXT: movslq %ebx, %r11
; X64-NEXT: movslq %r14d, %rbx
; X64-NEXT: movslq %esi, %rsi
-; X64-NEXT: movslq %r8d, %rcx
; X64-NEXT: shlq $2, %rcx
; X64-NEXT: movslq %ebp, %rax
; X64-NEXT: .p2align 4, 0x90
@@ -210,8 +210,8 @@
; X64-NEXT: addl (%rdi,%rbx), %ebp
; X64-NEXT: addl (%rdi,%r11), %ebp
; X64-NEXT: addl (%rdi,%r10), %ebp
-; X64-NEXT: movl %ebp, (%rdx)
; X64-NEXT: addq %rax, %rdi
+; X64-NEXT: movl %ebp, (%rdx)
; X64-NEXT: addq %rcx, %rdx
; X64-NEXT: decl %r9d
; X64-NEXT: jne .LBB2_2
@@ -232,10 +232,10 @@
; X32-NEXT: je .LBB2_3
; X32-NEXT: # %bb.1: # %for.body.lr.ph
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: addl %esi, %edi
; X32-NEXT: shll $2, %ecx
; X32-NEXT: .p2align 4, 0x90
@@ -249,9 +249,9 @@
; X32-NEXT: addl (%esi,%ebx), %ebp
; X32-NEXT: addl %esi, %ebx
; X32-NEXT: addl (%esi,%ebx), %ebp
-; X32-NEXT: movl %ebp, (%edx)
; X32-NEXT: addl %esi, %ebx
; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: movl %ebp, (%edx)
; X32-NEXT: addl %ecx, %edx
; X32-NEXT: decl %eax
; X32-NEXT: jne .LBB2_2
@@ -347,10 +347,10 @@
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
-; X32-NEXT: movl $3, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl $3, %eax
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB3_1: # %for.body
; X32-NEXT: # =>This Inner Loop Header: Depth=1
@@ -444,13 +444,13 @@
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB4_1: # %for.body
; X64-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NEXT: movl %eax, (%rdi,%rax,4)
; X64-NEXT: leal 1(%rax), %ecx
+; X64-NEXT: leal 2(%rax), %r8d
+; X64-NEXT: leal 3(%rax), %edx
+; X64-NEXT: movl %eax, (%rdi,%rax,4)
; X64-NEXT: movl %ecx, 4(%rdi,%rax,4)
-; X64-NEXT: leal 2(%rax), %ecx
-; X64-NEXT: movl %ecx, 8(%rdi,%rax,4)
-; X64-NEXT: leal 3(%rax), %ecx
-; X64-NEXT: movl %ecx, 12(%rdi,%rax,4)
+; X64-NEXT: movl %r8d, 8(%rdi,%rax,4)
+; X64-NEXT: movl %edx, 12(%rdi,%rax,4)
; X64-NEXT: addq $4, %rax
; X64-NEXT: cmpl %esi, %eax
; X64-NEXT: jl .LBB4_1
@@ -460,14 +460,14 @@
; X32-LABEL: multioper:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %esi
-; X32-NEXT: xorl %eax, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: xorl %eax, %eax
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB4_1: # %for.body
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl %eax, (%edx,%eax,4)
; X32-NEXT: leal 1(%eax), %esi
+; X32-NEXT: movl %eax, (%edx,%eax,4)
; X32-NEXT: movl %esi, 4(%edx,%eax,4)
; X32-NEXT: leal 2(%eax), %esi
; X32-NEXT: movl %esi, 8(%edx,%eax,4)
@@ -512,10 +512,10 @@
; X64-LABEL: testCmpZero:
; X64: # %bb.0: # %entry
; X64-NEXT: movslq %edx, %rdx
-; X64-NEXT: addq %rdx, %rdi
; X64-NEXT: movslq %ecx, %r9
-; X64-NEXT: addq %rsi, %r9
; X64-NEXT: addl %edx, %r8d
+; X64-NEXT: addq %rdx, %rdi
+; X64-NEXT: addq %rsi, %r9
; X64-NEXT: movslq %r8d, %rcx
; X64-NEXT: subq %rdx, %rcx
; X64-NEXT: xorl %edx, %edx
@@ -534,11 +534,11 @@
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl %esi, %esi
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB5_1: # %for.body82.us