diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -60,8 +60,6 @@
 def CH : X86Reg<"ch", 5>;
 def BH : X86Reg<"bh", 7>;
 
-// X86-64 only, requires REX.
-let CostPerUse = [1] in {
 def SIL : X86Reg<"sil", 6>;
 def DIL : X86Reg<"dil", 7>;
 def BPL : X86Reg<"bpl", 5>;
@@ -74,7 +72,6 @@
 def R13B : X86Reg<"r13b", 13>;
 def R14B : X86Reg<"r14b", 14>;
 def R15B : X86Reg<"r15b", 15>;
-}
 
 let isArtificial = 1 in {
 // High byte of the low 16 bits of the super-register:
@@ -125,9 +122,7 @@
 }
 def IP : X86Reg<"ip", 0>;
 
-// X86-64 only, requires REX.
-let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CostPerUse = [1],
-    CoveredBySubRegs = 1 in {
+let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CoveredBySubRegs = 1 in {
 def R8W : X86Reg<"r8w", 8, [R8B,R8BH]>;
 def R9W : X86Reg<"r9w", 9, [R9B,R9BH]>;
 def R10W : X86Reg<"r10w", 10, [R10B,R10BH]>;
@@ -151,9 +146,7 @@
 def EIP : X86Reg<"eip", 0, [IP, HIP]>, DwarfRegNum<[-2, 8, 8]>;
 }
 
-// X86-64 only, requires REX
-let SubRegIndices = [sub_16bit, sub_16bit_hi], CostPerUse = [1],
-    CoveredBySubRegs = 1 in {
+let SubRegIndices = [sub_16bit, sub_16bit_hi], CoveredBySubRegs = 1 in {
 def R8D : X86Reg<"r8d", 8, [R8W,R8WH]>;
 def R9D : X86Reg<"r9d", 9, [R9W,R9WH]>;
 def R10D : X86Reg<"r10d", 10, [R10W,R10WH]>;
@@ -175,8 +168,6 @@
 def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>;
 def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>;
 
-// These also require REX.
-let CostPerUse = [1] in {
 def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>;
 def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>;
 def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>;
@@ -186,7 +177,7 @@
 def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>;
 def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>;
 def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>;
-}}
+}
 
 // MMX Registers. These are actually aliased to ST0 .. ST7
 def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>;
@@ -218,8 +209,6 @@
 def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>;
 def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>;
 
-// X86-64 only
-let CostPerUse = [1] in {
 def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>;
 def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>;
 def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>;
@@ -246,8 +235,6 @@
 def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[81, -2, -2]>;
 def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[82, -2, -2]>;
 
-} // CostPerUse
-
 // YMM0-15 registers, used by AVX instructions and
 // YMM16-31 registers, used by AVX-512 instructions.
let SubRegIndices = [sub_xmm] in { diff --git a/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll b/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll --- a/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll +++ b/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll @@ -14,43 +14,43 @@ ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: movslq (%rdi), %rdi -; CHECK-NEXT: movslq (%rsi), %r8 -; CHECK-NEXT: movslq (%rdx), %r10 -; CHECK-NEXT: movl (%rcx), %esi +; CHECK-NEXT: movslq (%rdi), %r8 +; CHECK-NEXT: movslq (%rsi), %rax +; CHECK-NEXT: movslq (%rdx), %rsi +; CHECK-NEXT: movl (%rcx), %edi ; CHECK-NEXT: movq %rsp, %rcx -; CHECK-NEXT: subl %edi, %r8d -; CHECK-NEXT: movslq %r8d, %rdx +; CHECK-NEXT: subl %r8d, %eax +; CHECK-NEXT: movslq %eax, %rdx ; CHECK-NEXT: js .LBB0_1 ; CHECK-NEXT: # %bb.11: # %b63 ; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: js .LBB0_14 ; CHECK-NEXT: # %bb.12: -; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: xorl %r8d, %r8d ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_13: # %a25b ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: testb %r8b, %r8b ; CHECK-NEXT: je .LBB0_13 ; CHECK-NEXT: .LBB0_14: # %b85 -; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: movb $1, %r8b +; CHECK-NEXT: testb %r8b, %r8b ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.15: -; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: xorl %r8d, %r8d ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_16: # %a25b140 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: testb %r8b, %r8b ; CHECK-NEXT: je .LBB0_16 ; CHECK-NEXT: .LBB0_1: # %a29b -; CHECK-NEXT: cmpl %r10d, %esi +; CHECK-NEXT: cmpl %esi, %edi ; CHECK-NEXT: js .LBB0_10 ; CHECK-NEXT: # %bb.2: # %b158 ; CHECK-NEXT: movslq (%r9), %rsi ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movb $1, %r9b +; CHECK-NEXT: movb $1, %r8b ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_9: # %b1606 @@ -73,7 +73,7 @@ ; CHECK-NEXT: # Child Loop BB0_33 Depth 3 ; CHECK-NEXT: # Child Loop BB0_34 Depth 2 ; CHECK-NEXT: # Child Loop BB0_36 Depth 2 -; CHECK-NEXT: testl %r8d, %r8d +; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: js .LBB0_4 ; CHECK-NEXT: # %bb.17: # %b179 ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 @@ -87,7 +87,7 @@ ; CHECK-NEXT: je .LBB0_37 ; CHECK-NEXT: .LBB0_18: # %b188 ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: testb %r9b, %r9b +; CHECK-NEXT: testb %r8b, %r8b ; CHECK-NEXT: jne .LBB0_4 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_19: # %a30b294 @@ -97,23 +97,23 @@ ; CHECK-NEXT: je .LBB0_19 ; CHECK-NEXT: .LBB0_4: # %a33b ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: movl %esi, %r10d -; CHECK-NEXT: orl %r8d, %r10d +; CHECK-NEXT: movl %esi, %r9d +; CHECK-NEXT: orl %eax, %r9d ; CHECK-NEXT: jns .LBB0_20 ; CHECK-NEXT: .LBB0_5: # %a50b ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: shrl $31, %r10d -; CHECK-NEXT: movl %r8d, %eax -; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: shrl $31, %r9d +; CHECK-NEXT: movl %eax, %r10d +; CHECK-NEXT: orl %esi, %r10d ; CHECK-NEXT: jns .LBB0_26 ; CHECK-NEXT: .LBB0_6: # %a57b ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: testb %r10b, %r10b +; CHECK-NEXT: shrl $31, %r10d +; CHECK-NEXT: testb %r9b, %r9b ; CHECK-NEXT: je .LBB0_30 ; CHECK-NEXT: .LBB0_7: # %a66b ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; 
CHECK-NEXT: testb %al, %al +; CHECK-NEXT: testb %r10b, %r10b ; CHECK-NEXT: jne .LBB0_8 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_34: # %a74b @@ -127,7 +127,7 @@ ; CHECK-NEXT: jne .LBB0_34 ; CHECK-NEXT: .LBB0_8: # %a93b ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: testl %r8d, %r8d +; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: js .LBB0_9 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_36: # %a97b @@ -183,7 +183,7 @@ ; CHECK-NEXT: je .LBB0_38 ; CHECK-NEXT: .LBB0_27: # %b879 ; CHECK-NEXT: # in Loop: Header=BB0_26 Depth=2 -; CHECK-NEXT: testb %r9b, %r9b +; CHECK-NEXT: testb %r8b, %r8b ; CHECK-NEXT: jne .LBB0_28 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_29: # %a53b1019 diff --git a/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll b/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll --- a/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll +++ b/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll @@ -28,23 +28,26 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: subq $32, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: .cfi_offset %rbx, -40 +; CHECK-NEXT: .cfi_offset %rbx, -48 +; CHECK-NEXT: .cfi_offset %r12, -40 ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsi, %r14 -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movq %rdi, %r14 ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi ; CHECK-NEXT: callq __ubyte_convert_to_ctype ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: js LBB0_4 ; CHECK-NEXT: ## %bb.1: ## %cond_next.i ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: movq %r14, %rdi +; CHECK-NEXT: movq %rbx, %rdi ; CHECK-NEXT: callq __ubyte_convert_to_ctype ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: sarl $31, %ecx @@ -66,18 +69,18 @@ ; CHECK-NEXT: cmpl $-1, %eax ; CHECK-NEXT: je LBB0_3 ; CHECK-NEXT: LBB0_6: ## %bb35 -; CHECK-NEXT: movq _PyUFunc_API@GOTPCREL(%rip), %rbp -; CHECK-NEXT: movq (%rbp), %rax +; CHECK-NEXT: movq _PyUFunc_API@GOTPCREL(%rip), %r14 +; CHECK-NEXT: movq (%r14), %rax ; CHECK-NEXT: callq *216(%rax) ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: testb %dl, %dl ; CHECK-NEXT: je LBB0_11 ; CHECK-NEXT: ## %bb.7: ## %cond_false.i -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-NEXT: movzbl %bl, %ecx +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movzbl %sil, %ecx ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: divb %dl -; CHECK-NEXT: movl %eax, %r14d +; CHECK-NEXT: movl %eax, %r15d ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: jne LBB0_12 ; CHECK-NEXT: jmp LBB0_14 @@ -91,26 +94,25 @@ ; CHECK-NEXT: movq 80(%rax), %rax ; CHECK-NEXT: LBB0_10: ## %bb4 ; CHECK-NEXT: movq 96(%rax), %rax -; CHECK-NEXT: movq %rbx, %rdi -; CHECK-NEXT: movq %r14, %rsi +; CHECK-NEXT: movq %r14, %rdi +; CHECK-NEXT: movq %rbx, %rsi ; CHECK-NEXT: callq *40(%rax) ; CHECK-NEXT: jmp LBB0_28 ; CHECK-NEXT: LBB0_11: ## %cond_true.i ; CHECK-NEXT: movl $4, %edi ; CHECK-NEXT: callq _feraiseexcept ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-NEXT: xorl %r14d, %r14d -; CHECK-NEXT: testb %bl, %bl +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: xorl %r15d, %r15d +; CHECK-NEXT: testb %sil, %sil ; CHECK-NEXT: je LBB0_14 ; CHECK-NEXT: 
LBB0_12: ## %cond_false.i ; CHECK-NEXT: testb %dl, %dl ; CHECK-NEXT: je LBB0_14 ; CHECK-NEXT: ## %bb.13: ## %cond_next17.i -; CHECK-NEXT: movzbl %bl, %eax +; CHECK-NEXT: movzbl %sil, %eax ; CHECK-NEXT: divb %dl -; CHECK-NEXT: movzbl %ah, %eax -; CHECK-NEXT: movl %eax, %r15d +; CHECK-NEXT: movzbl %ah, %ebx ; CHECK-NEXT: jmp LBB0_18 ; CHECK-NEXT: LBB0_14: ## %cond_true.i200 ; CHECK-NEXT: testb %dl, %dl @@ -119,15 +121,15 @@ ; CHECK-NEXT: movl $4, %edi ; CHECK-NEXT: callq _feraiseexcept ; CHECK-NEXT: LBB0_17: ## %ubyte_ctype_remainder.exit -; CHECK-NEXT: xorl %r15d, %r15d +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: LBB0_18: ## %ubyte_ctype_remainder.exit -; CHECK-NEXT: movq (%rbp), %rax +; CHECK-NEXT: movq (%r14), %rax ; CHECK-NEXT: callq *224(%rax) ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: je LBB0_21 ; CHECK-NEXT: ## %bb.19: ## %cond_true61 -; CHECK-NEXT: movl %eax, %ebx -; CHECK-NEXT: movq (%rbp), %rax +; CHECK-NEXT: movl %eax, %ebp +; CHECK-NEXT: movq (%r14), %rax ; CHECK-NEXT: movq _.str5@GOTPCREL(%rip), %rdi ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdx @@ -137,11 +139,11 @@ ; CHECK-NEXT: js LBB0_27 ; CHECK-NEXT: ## %bb.20: ## %cond_next73 ; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq (%rbp), %rax +; CHECK-NEXT: movq (%r14), %rax ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movl %ebx, %edx +; CHECK-NEXT: movl %ebp, %edx ; CHECK-NEXT: callq *232(%rax) ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: jne LBB0_27 @@ -151,40 +153,41 @@ ; CHECK-NEXT: testq %rax, %rax ; CHECK-NEXT: je LBB0_27 ; CHECK-NEXT: ## %bb.22: ## %cond_next97 -; CHECK-NEXT: movq %rax, %rbx -; CHECK-NEXT: movq _PyArray_API@GOTPCREL(%rip), %rbp -; CHECK-NEXT: movq (%rbp), %rax +; CHECK-NEXT: movq %rax, %r14 +; CHECK-NEXT: movq _PyArray_API@GOTPCREL(%rip), %r12 +; CHECK-NEXT: movq (%r12), %rax ; CHECK-NEXT: movq 200(%rax), %rdi ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: callq *304(%rdi) ; CHECK-NEXT: testq %rax, %rax ; CHECK-NEXT: je LBB0_25 ; CHECK-NEXT: ## %bb.23: ## %cond_next135 -; CHECK-NEXT: movb %r14b, 16(%rax) -; CHECK-NEXT: movq %rax, 24(%rbx) -; CHECK-NEXT: movq (%rbp), %rax +; CHECK-NEXT: movb %r15b, 16(%rax) +; CHECK-NEXT: movq %rax, 24(%r14) +; CHECK-NEXT: movq (%r12), %rax ; CHECK-NEXT: movq 200(%rax), %rdi ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: callq *304(%rdi) ; CHECK-NEXT: testq %rax, %rax ; CHECK-NEXT: je LBB0_25 ; CHECK-NEXT: ## %bb.24: ## %cond_next182 -; CHECK-NEXT: movb %r15b, 16(%rax) -; CHECK-NEXT: movq %rax, 32(%rbx) -; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: movb %bl, 16(%rax) +; CHECK-NEXT: movq %rax, 32(%r14) +; CHECK-NEXT: movq %r14, %rax ; CHECK-NEXT: jmp LBB0_28 ; CHECK-NEXT: LBB0_25: ## %cond_true113 -; CHECK-NEXT: decq (%rbx) +; CHECK-NEXT: decq (%r14) ; CHECK-NEXT: jne LBB0_27 ; CHECK-NEXT: ## %bb.26: ## %cond_true126 -; CHECK-NEXT: movq 8(%rbx), %rax -; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: movq 8(%r14), %rax +; CHECK-NEXT: movq %r14, %rdi ; CHECK-NEXT: callq *48(%rax) ; CHECK-NEXT: LBB0_27: ## %UnifiedReturnBlock ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: LBB0_28: ## %UnifiedReturnBlock -; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: addq $32, %rsp ; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/llvm/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll --- 
a/llvm/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll +++ b/llvm/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll @@ -16,16 +16,16 @@ ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movq %rsi, %r14 -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: orq $2097152, %r14 ## imm = 0x200000 -; CHECK-NEXT: andl $15728640, %r14d ## imm = 0xF00000 +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: orq $2097152, %rbx ## imm = 0x200000 +; CHECK-NEXT: andl $15728640, %ebx ## imm = 0xF00000 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_1: ## %bb4 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: callq _xxGetOffsetForCode -; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: movq %r14, %rdi ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: callq _xxCalculateMidType @@ -33,7 +33,7 @@ ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: ## %bb.2: ## %bb26 ; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: cmpl $1048576, %r14d ## imm = 0x100000 +; CHECK-NEXT: cmpl $1048576, %ebx ## imm = 0x100000 ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: ## %bb.3: ## %bb.i ; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 diff --git a/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll b/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll --- a/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll +++ b/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll @@ -10,226 +10,220 @@ define fastcc i64 @foo() nounwind { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq X(%rip), %rcx ; CHECK-NEXT: movq X(%rip), %r9 -; CHECK-NEXT: movq X(%rip), %r15 -; CHECK-NEXT: movq X(%rip), %rax -; CHECK-NEXT: movq X(%rip), %rdx -; CHECK-NEXT: movq X(%rip), %r12 -; CHECK-NEXT: movq X(%rip), %r14 -; CHECK-NEXT: movq X(%rip), %r11 +; CHECK-NEXT: movq X(%rip), %r8 ; CHECK-NEXT: movq X(%rip), %rdi -; CHECK-NEXT: addq %r12, %rdi -; CHECK-NEXT: movq X(%rip), %rcx +; CHECK-NEXT: movq X(%rip), %rsi +; CHECK-NEXT: movq X(%rip), %rdx ; CHECK-NEXT: movq X(%rip), %rbx -; CHECK-NEXT: bswapq %rcx -; CHECK-NEXT: leaq (%r11,%r14), %rsi -; CHECK-NEXT: addq %r12, %rsi -; CHECK-NEXT: addq %rdi, %rsi -; CHECK-NEXT: addq %rcx, %rsi -; CHECK-NEXT: leaq (%r15,%r9), %r8 -; CHECK-NEXT: leaq (%r8,%rax), %r10 -; CHECK-NEXT: addq %rsi, %rdx +; CHECK-NEXT: movq X(%rip), %rax +; CHECK-NEXT: addq %rsi, %rax +; CHECK-NEXT: movq X(%rip), %r10 +; CHECK-NEXT: movq X(%rip), %r11 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: leaq (%rbx,%rdx), %r14 +; CHECK-NEXT: addq %rsi, %r14 +; CHECK-NEXT: addq %rax, %r14 +; CHECK-NEXT: addq %r10, %r14 +; CHECK-NEXT: leaq (%r9,%rcx), %rax +; CHECK-NEXT: leaq (%rax,%r8), %r10 +; CHECK-NEXT: addq %r14, %rdi ; CHECK-NEXT: addq %r10, %r10 +; CHECK-NEXT: bswapq %r11 +; CHECK-NEXT: addq %r14, %r10 +; CHECK-NEXT: addq %rbx, %r11 +; CHECK-NEXT: leaq (%rsi,%rdx), %rbx +; CHECK-NEXT: addq %rdi, %rbx +; CHECK-NEXT: addq %rbx, %r11 +; CHECK-NEXT: addq %rax, %rax +; CHECK-NEXT: addq %r10, %rax +; CHECK-NEXT: movq X(%rip), %rbx +; CHECK-NEXT: addq %rdi, %r11 +; CHECK-NEXT: addq %r11, %r8 +; CHECK-NEXT: addq %r10, %rax +; CHECK-NEXT: addq %r11, %rax +; CHECK-NEXT: bswapq %rbx +; CHECK-NEXT: leaq (%rdi,%rsi), %r11 +; CHECK-NEXT: addq %r8, %r11 +; CHECK-NEXT: addq %rdx, %rbx +; CHECK-NEXT: addq %r11, %rbx +; CHECK-NEXT: leaq (%r10,%rcx), %rdx +; CHECK-NEXT: addq %rdx, %rdx +; CHECK-NEXT: addq %rax, %rdx +; CHECK-NEXT: movq 
X(%rip), %r11 +; CHECK-NEXT: addq %r8, %rbx +; CHECK-NEXT: addq %rbx, %r9 +; CHECK-NEXT: addq %rax, %rdx +; CHECK-NEXT: addq %rbx, %rdx +; CHECK-NEXT: bswapq %r11 +; CHECK-NEXT: leaq (%r8,%rdi), %rbx +; CHECK-NEXT: addq %r9, %rbx +; CHECK-NEXT: addq %rsi, %r11 +; CHECK-NEXT: addq %rbx, %r11 +; CHECK-NEXT: leaq (%rax,%r10), %rsi +; CHECK-NEXT: addq %rsi, %rsi +; CHECK-NEXT: addq %rdx, %rsi +; CHECK-NEXT: movq X(%rip), %rbx +; CHECK-NEXT: addq %r9, %r11 +; CHECK-NEXT: addq %r11, %rcx +; CHECK-NEXT: addq %rdx, %rsi +; CHECK-NEXT: addq %r11, %rsi ; CHECK-NEXT: bswapq %rbx -; CHECK-NEXT: addq %rsi, %r10 +; CHECK-NEXT: leaq (%r9,%r8), %r11 +; CHECK-NEXT: addq %rcx, %r11 +; CHECK-NEXT: addq %rdi, %rbx ; CHECK-NEXT: addq %r11, %rbx -; CHECK-NEXT: leaq (%r12,%r14), %rcx -; CHECK-NEXT: addq %rdx, %rcx +; CHECK-NEXT: leaq (%rdx,%rax), %rdi +; CHECK-NEXT: addq %rdi, %rdi +; CHECK-NEXT: addq %rsi, %rdi +; CHECK-NEXT: movq X(%rip), %r11 ; CHECK-NEXT: addq %rcx, %rbx +; CHECK-NEXT: addq %rbx, %r10 +; CHECK-NEXT: addq %rsi, %rdi +; CHECK-NEXT: addq %rbx, %rdi +; CHECK-NEXT: bswapq %r11 +; CHECK-NEXT: leaq (%rcx,%r9), %rbx +; CHECK-NEXT: addq %r10, %rbx +; CHECK-NEXT: addq %r8, %r11 +; CHECK-NEXT: addq %rbx, %r11 +; CHECK-NEXT: leaq (%rsi,%rdx), %r8 ; CHECK-NEXT: addq %r8, %r8 -; CHECK-NEXT: addq %r10, %r8 -; CHECK-NEXT: movq X(%rip), %rcx +; CHECK-NEXT: addq %rdi, %r8 +; CHECK-NEXT: movq X(%rip), %rbx +; CHECK-NEXT: addq %r10, %r11 +; CHECK-NEXT: addq %r11, %rax +; CHECK-NEXT: addq %rdi, %r8 +; CHECK-NEXT: addq %r11, %r8 +; CHECK-NEXT: bswapq %rbx +; CHECK-NEXT: leaq (%r10,%rcx), %r11 +; CHECK-NEXT: addq %rax, %r11 +; CHECK-NEXT: addq %r9, %rbx +; CHECK-NEXT: addq %r11, %rbx +; CHECK-NEXT: leaq (%rdi,%rsi), %r9 +; CHECK-NEXT: addq %r9, %r9 +; CHECK-NEXT: addq %r8, %r9 +; CHECK-NEXT: movq X(%rip), %r11 +; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rbx, %rdx +; CHECK-NEXT: addq %r8, %r9 +; CHECK-NEXT: addq %rbx, %r9 +; CHECK-NEXT: bswapq %r11 +; CHECK-NEXT: leaq (%rax,%r10), %rbx ; CHECK-NEXT: addq %rdx, %rbx -; CHECK-NEXT: addq %rbx, %rax -; CHECK-NEXT: addq %r10, %r8 -; CHECK-NEXT: addq %rbx, %r8 -; CHECK-NEXT: bswapq %rcx -; CHECK-NEXT: leaq (%rdx,%r12), %rsi -; CHECK-NEXT: addq %rax, %rsi -; CHECK-NEXT: addq %r14, %rcx -; CHECK-NEXT: addq %rsi, %rcx -; CHECK-NEXT: leaq (%r10,%r9), %rbx -; CHECK-NEXT: addq %rbx, %rbx -; CHECK-NEXT: addq %r8, %rbx -; CHECK-NEXT: movq X(%rip), %rdi -; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: addq %rcx, %r15 -; CHECK-NEXT: addq %r8, %rbx -; CHECK-NEXT: addq %rcx, %rbx -; CHECK-NEXT: bswapq %rdi -; CHECK-NEXT: leaq (%rax,%rdx), %rcx -; CHECK-NEXT: addq %r15, %rcx -; CHECK-NEXT: addq %r12, %rdi -; CHECK-NEXT: addq %rcx, %rdi -; CHECK-NEXT: leaq (%r8,%r10), %r12 -; CHECK-NEXT: addq %r12, %r12 -; CHECK-NEXT: addq %rbx, %r12 -; CHECK-NEXT: movq X(%rip), %rcx -; CHECK-NEXT: addq %r15, %rdi -; CHECK-NEXT: addq %rdi, %r9 -; CHECK-NEXT: addq %rbx, %r12 -; CHECK-NEXT: addq %rdi, %r12 -; CHECK-NEXT: bswapq %rcx -; CHECK-NEXT: leaq (%r15,%rax), %rdi -; CHECK-NEXT: addq %r9, %rdi -; CHECK-NEXT: addq %rdx, %rcx -; CHECK-NEXT: addq %rdi, %rcx -; CHECK-NEXT: leaq (%rbx,%r8), %r13 -; CHECK-NEXT: addq %r13, %r13 -; CHECK-NEXT: addq %r12, %r13 -; CHECK-NEXT: movq X(%rip), %rdx +; CHECK-NEXT: addq %rcx, %r11 +; CHECK-NEXT: addq %rbx, %r11 +; CHECK-NEXT: leaq (%r8,%rdi), %rcx +; CHECK-NEXT: addq %rcx, %rcx ; CHECK-NEXT: addq %r9, %rcx +; CHECK-NEXT: movq X(%rip), %rbx +; CHECK-NEXT: addq %rdx, %r11 +; CHECK-NEXT: addq %r11, %rsi +; CHECK-NEXT: addq %r9, %rcx +; CHECK-NEXT: addq %r11, 
%rcx +; CHECK-NEXT: bswapq %rbx +; CHECK-NEXT: leaq (%rdx,%rax), %r11 +; CHECK-NEXT: addq %rsi, %r11 +; CHECK-NEXT: addq %r10, %rbx +; CHECK-NEXT: addq %r11, %rbx +; CHECK-NEXT: leaq (%r9,%r8), %r10 +; CHECK-NEXT: addq %r10, %r10 ; CHECK-NEXT: addq %rcx, %r10 -; CHECK-NEXT: addq %r12, %r13 -; CHECK-NEXT: addq %rcx, %r13 -; CHECK-NEXT: bswapq %rdx -; CHECK-NEXT: leaq (%r9,%r15), %rcx -; CHECK-NEXT: addq %r10, %rcx -; CHECK-NEXT: addq %rax, %rdx -; CHECK-NEXT: addq %rcx, %rdx -; CHECK-NEXT: leaq (%r12,%rbx), %r14 -; CHECK-NEXT: addq %r14, %r14 -; CHECK-NEXT: addq %r13, %r14 -; CHECK-NEXT: movq X(%rip), %rax -; CHECK-NEXT: addq %r10, %rdx -; CHECK-NEXT: addq %rdx, %r8 -; CHECK-NEXT: addq %r13, %r14 -; CHECK-NEXT: addq %rdx, %r14 -; CHECK-NEXT: bswapq %rax -; CHECK-NEXT: leaq (%r10,%r9), %rcx -; CHECK-NEXT: addq %r8, %rcx -; CHECK-NEXT: addq %r15, %rax -; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: leaq (%r13,%r12), %r11 +; CHECK-NEXT: movq X(%rip), %r14 +; CHECK-NEXT: addq %rsi, %rbx +; CHECK-NEXT: addq %rbx, %rdi +; CHECK-NEXT: addq %rcx, %r10 +; CHECK-NEXT: addq %rbx, %r10 +; CHECK-NEXT: bswapq %r14 +; CHECK-NEXT: leaq (%rsi,%rdx), %r11 +; CHECK-NEXT: addq %rdi, %r11 +; CHECK-NEXT: addq %rax, %r14 +; CHECK-NEXT: addq %r11, %r14 +; CHECK-NEXT: leaq (%rcx,%r9), %r11 ; CHECK-NEXT: addq %r11, %r11 +; CHECK-NEXT: addq %r10, %r11 +; CHECK-NEXT: movq X(%rip), %rax +; CHECK-NEXT: addq %rdi, %r14 +; CHECK-NEXT: addq %r14, %r8 +; CHECK-NEXT: addq %r10, %r11 ; CHECK-NEXT: addq %r14, %r11 -; CHECK-NEXT: movq X(%rip), %rcx +; CHECK-NEXT: bswapq %rax +; CHECK-NEXT: leaq (%rdi,%rsi), %rbx +; CHECK-NEXT: addq %r8, %rbx +; CHECK-NEXT: addq %rdx, %rax +; CHECK-NEXT: addq %rbx, %rax +; CHECK-NEXT: leaq (%r10,%rcx), %rdx +; CHECK-NEXT: addq %rdx, %rdx +; CHECK-NEXT: addq %r11, %rdx +; CHECK-NEXT: movq X(%rip), %rbx ; CHECK-NEXT: addq %r8, %rax +; CHECK-NEXT: addq %rax, %r9 +; CHECK-NEXT: addq %r11, %rdx +; CHECK-NEXT: addq %rax, %rdx +; CHECK-NEXT: bswapq %rbx +; CHECK-NEXT: leaq (%r8,%rdi), %rax +; CHECK-NEXT: addq %r9, %rax +; CHECK-NEXT: addq %rsi, %rbx ; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: addq %r14, %r11 -; CHECK-NEXT: addq %rax, %r11 -; CHECK-NEXT: bswapq %rcx -; CHECK-NEXT: leaq (%r8,%r10), %rax -; CHECK-NEXT: addq %rbx, %rax -; CHECK-NEXT: addq %r9, %rcx -; CHECK-NEXT: addq %rax, %rcx -; CHECK-NEXT: leaq (%r14,%r13), %r9 -; CHECK-NEXT: addq %r9, %r9 -; CHECK-NEXT: addq %r11, %r9 -; CHECK-NEXT: movq X(%rip), %rax +; CHECK-NEXT: leaq (%r11,%r10), %rax +; CHECK-NEXT: addq %rax, %rax +; CHECK-NEXT: addq %rdx, %rax +; CHECK-NEXT: movq X(%rip), %r14 +; CHECK-NEXT: addq %r9, %rbx ; CHECK-NEXT: addq %rbx, %rcx -; CHECK-NEXT: addq %rcx, %r12 -; CHECK-NEXT: addq %r11, %r9 -; CHECK-NEXT: addq %rcx, %r9 -; CHECK-NEXT: bswapq %rax -; CHECK-NEXT: leaq (%rbx,%r8), %rcx -; CHECK-NEXT: addq %r12, %rcx -; CHECK-NEXT: addq %r10, %rax -; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: leaq (%r11,%r14), %r10 -; CHECK-NEXT: addq %r10, %r10 -; CHECK-NEXT: addq %r9, %r10 -; CHECK-NEXT: movq X(%rip), %rsi -; CHECK-NEXT: addq %r12, %rax -; CHECK-NEXT: addq %rax, %r13 -; CHECK-NEXT: addq %r9, %r10 -; CHECK-NEXT: addq %rax, %r10 -; CHECK-NEXT: bswapq %rsi -; CHECK-NEXT: leaq (%r12,%rbx), %rax -; CHECK-NEXT: addq %r13, %rax -; CHECK-NEXT: addq %r8, %rsi -; CHECK-NEXT: addq %rax, %rsi -; CHECK-NEXT: leaq (%r9,%r11), %rdx -; CHECK-NEXT: addq %rdx, %rdx -; CHECK-NEXT: addq %r10, %rdx -; CHECK-NEXT: movq X(%rip), %rax -; CHECK-NEXT: addq %r13, %rsi +; CHECK-NEXT: addq %rdx, %rax +; CHECK-NEXT: addq %rbx, %rax +; CHECK-NEXT: bswapq 
%r14 +; CHECK-NEXT: leaq (%r9,%r8), %rsi +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: addq %rdi, %r14 ; CHECK-NEXT: addq %rsi, %r14 -; CHECK-NEXT: addq %r10, %rdx -; CHECK-NEXT: addq %rsi, %rdx -; CHECK-NEXT: bswapq %rax -; CHECK-NEXT: leaq (%r13,%r12), %rsi +; CHECK-NEXT: leaq (%rdx,%r11), %rsi +; CHECK-NEXT: addq %rsi, %rsi +; CHECK-NEXT: addq %rax, %rsi +; CHECK-NEXT: movq X(%rip), %rdi +; CHECK-NEXT: addq %rcx, %r14 +; CHECK-NEXT: addq %r14, %r10 +; CHECK-NEXT: addq %rax, %rsi ; CHECK-NEXT: addq %r14, %rsi -; CHECK-NEXT: addq %rbx, %rax -; CHECK-NEXT: addq %rsi, %rax -; CHECK-NEXT: leaq (%r10,%r9), %r8 +; CHECK-NEXT: bswapq %rdi +; CHECK-NEXT: leaq (%rcx,%r9), %rbx +; CHECK-NEXT: addq %r10, %rbx +; CHECK-NEXT: addq %r8, %rdi +; CHECK-NEXT: addq %rbx, %rdi +; CHECK-NEXT: leaq (%rax,%rdx), %r8 ; CHECK-NEXT: addq %r8, %r8 -; CHECK-NEXT: addq %rdx, %r8 -; CHECK-NEXT: movq X(%rip), %rsi -; CHECK-NEXT: addq %r14, %rax -; CHECK-NEXT: addq %rax, %r11 -; CHECK-NEXT: addq %rdx, %r8 -; CHECK-NEXT: addq %rax, %r8 -; CHECK-NEXT: bswapq %rsi -; CHECK-NEXT: leaq (%r14,%r13), %rax -; CHECK-NEXT: addq %r11, %rax -; CHECK-NEXT: addq %r12, %rsi -; CHECK-NEXT: addq %rax, %rsi -; CHECK-NEXT: leaq (%rdx,%r10), %rax -; CHECK-NEXT: addq %rax, %rax -; CHECK-NEXT: addq %r8, %rax +; CHECK-NEXT: addq %rsi, %r8 +; CHECK-NEXT: addq %r10, %rdi +; CHECK-NEXT: addq %rdi, %r11 +; CHECK-NEXT: addq %rsi, %r8 +; CHECK-NEXT: addq %rdi, %r8 ; CHECK-NEXT: movq X(%rip), %rdi -; CHECK-NEXT: addq %r11, %rsi -; CHECK-NEXT: addq %rsi, %r9 -; CHECK-NEXT: addq %r8, %rax -; CHECK-NEXT: addq %rsi, %rax ; CHECK-NEXT: bswapq %rdi -; CHECK-NEXT: leaq (%r11,%r14), %rsi -; CHECK-NEXT: addq %r9, %rsi -; CHECK-NEXT: addq %r13, %rdi -; CHECK-NEXT: addq %rsi, %rdi -; CHECK-NEXT: leaq (%r8,%rdx), %rsi -; CHECK-NEXT: addq %rsi, %rsi -; CHECK-NEXT: addq %rax, %rsi -; CHECK-NEXT: movq X(%rip), %rcx ; CHECK-NEXT: addq %r9, %rdi -; CHECK-NEXT: addq %rdi, %r10 -; CHECK-NEXT: addq %rax, %rsi -; CHECK-NEXT: addq %rdi, %rsi -; CHECK-NEXT: bswapq %rcx -; CHECK-NEXT: leaq (%r9,%r11), %rdi -; CHECK-NEXT: addq %r10, %rdi -; CHECK-NEXT: addq %r14, %rcx -; CHECK-NEXT: addq %rdi, %rcx -; CHECK-NEXT: leaq (%rax,%r8), %rdi -; CHECK-NEXT: addq %rdi, %rdi -; CHECK-NEXT: addq %rsi, %rdi -; CHECK-NEXT: addq %r10, %rcx -; CHECK-NEXT: addq %rcx, %rdx -; CHECK-NEXT: addq %rsi, %rdi -; CHECK-NEXT: addq %rcx, %rdi -; CHECK-NEXT: movq X(%rip), %rcx -; CHECK-NEXT: bswapq %rcx -; CHECK-NEXT: addq %r11, %rcx -; CHECK-NEXT: leaq (%r10,%r9), %rbx -; CHECK-NEXT: addq %rdx, %rbx -; CHECK-NEXT: addq %rbx, %rcx +; CHECK-NEXT: leaq (%r10,%rcx), %r9 +; CHECK-NEXT: addq %r11, %r9 +; CHECK-NEXT: addq %r9, %rdi ; CHECK-NEXT: addq %rax, %rsi ; CHECK-NEXT: addq %rsi, %rsi +; CHECK-NEXT: addq %r8, %rsi +; CHECK-NEXT: addq %r8, %rsi +; CHECK-NEXT: addq %r11, %rdi +; CHECK-NEXT: addq %rdi, %rdx ; CHECK-NEXT: addq %rdi, %rsi -; CHECK-NEXT: addq %rdi, %rsi -; CHECK-NEXT: addq %rdx, %rcx -; CHECK-NEXT: addq %rcx, %r8 -; CHECK-NEXT: addq %rcx, %rsi ; CHECK-NEXT: movq X(%rip), %rax ; CHECK-NEXT: bswapq %rax -; CHECK-NEXT: addq %r10, %rdx +; CHECK-NEXT: addq %r10, %r11 ; CHECK-NEXT: movq %rax, X(%rip) -; CHECK-NEXT: addq %r9, %rax -; CHECK-NEXT: addq %r8, %rdx +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: addq %rdx, %r11 +; CHECK-NEXT: addq %r11, %rax ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: addq %r8, %rax ; CHECK-NEXT: addq %rsi, %rax ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 ; CHECK-NEXT: retq %tmp = load 
volatile i64, ptr @X ; [#uses=7] %tmp1 = load volatile i64, ptr @X ; [#uses=5] diff --git a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll --- a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll +++ b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll @@ -4,21 +4,21 @@ define void @BZ2_bzDecompress_bb5_2E_outer_bb35_2E_i_bb54_2E_i(ptr, i32 %c_nblock_used.2.i, i32 %.reload51, ptr %.out, ptr %.out1, ptr %.out2, ptr %.out3) nounwind { ; CHECK-LABEL: BZ2_bzDecompress_bb5_2E_outer_bb35_2E_i_bb54_2E_i: ; CHECK: # %bb.0: # %newFuncRoot -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: movl %edx, %edx ; CHECK-NEXT: movl (%rdi,%rdx,4), %edx -; CHECK-NEXT: movzbl %dl, %eax -; CHECK-NEXT: addl $4, %eax +; CHECK-NEXT: movzbl %dl, %r10d +; CHECK-NEXT: addl $4, %r10d ; CHECK-NEXT: shrq $6, %rdx ; CHECK-NEXT: andl $67108860, %edx # imm = 0x3FFFFFC ; CHECK-NEXT: movl (%rdi,%rdx), %edx ; CHECK-NEXT: movzbl %dl, %edi ; CHECK-NEXT: shrl $8, %edx ; CHECK-NEXT: addl $5, %esi -; CHECK-NEXT: movl %eax, (%rcx) +; CHECK-NEXT: movl %r10d, (%rcx) ; CHECK-NEXT: movl %edi, (%r8) ; CHECK-NEXT: movl %edx, (%r9) -; CHECK-NEXT: movl %esi, (%r10) +; CHECK-NEXT: movl %esi, (%rax) ; CHECK-NEXT: retq newFuncRoot: br label %bb54.i diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll --- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll @@ -219,7 +219,7 @@ ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $1096, %rsp # imm = 0x448 -; CHECK-NEXT: movl %edi, %r14d +; CHECK-NEXT: movl %edi, %ebx ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: movb $1, (%rsp) @@ -228,37 +228,37 @@ ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg (%rsp) -; CHECK-NEXT: testl %r14d, %r14d +; CHECK-NEXT: testl %ebx, %ebx ; CHECK-NEXT: jg .LBB2_4 ; CHECK-NEXT: # %bb.1: # %.preheader ; CHECK-NEXT: movl $7, %ebp -; CHECK-NEXT: movl $buf, %r15d -; CHECK-NEXT: movl $32, %r12d -; CHECK-NEXT: movw $8, %bx +; CHECK-NEXT: movl $buf, %r14d +; CHECK-NEXT: movl $32, %r15d +; CHECK-NEXT: movw $8, %r12w ; CHECK-NEXT: movl $buf+2048, %r13d ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: tileloadd (%r15,%r12), %tmm0 +; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0 ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tilestored %tmm0, 64(%rsp,%rax) # 1024-byte Folded Spill ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm0 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm0, (%r13,%r12) +; CHECK-NEXT: tilestored %tmm0, (%r13,%r15) ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: decl %ebp ; CHECK-NEXT: cmpl $7, %ebp ; CHECK-NEXT: jne .LBB2_2 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: cmpl $3, %r14d +; CHECK-NEXT: cmpl $3, %ebx ; CHECK-NEXT: jne .LBB2_4 ; CHECK-NEXT: # %bb.6: ; CHECK-NEXT: testl %ebp, %ebp ; CHECK-NEXT: jne .LBB2_5 ; CHECK-NEXT: # %bb.7: -; CHECK-NEXT: incl %r14d +; CHECK-NEXT: incl %ebx ; CHECK-NEXT: jmp .LBB2_8 ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: callq foo @@ -269,9 +269,9 @@ ; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0 ; CHECK-NEXT: tilestored %tmm0, (%rcx,%rax) ; CHECK-NEXT: .LBB2_5: -; CHECK-NEXT: decl %r14d +; CHECK-NEXT: decl %ebx ; CHECK-NEXT: .LBB2_8: -; CHECK-NEXT: movl %r14d, %eax +; CHECK-NEXT: movl %ebx, %eax ; CHECK-NEXT: addq $1096, 
%rsp # imm = 0x448 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 @@ -297,15 +297,15 @@ ; IPRA-NEXT: jg .LBB2_4 ; IPRA-NEXT: # %bb.1: # %.preheader ; IPRA-NEXT: movl $7, %ecx -; IPRA-NEXT: movl $buf, %r8d +; IPRA-NEXT: movl $buf, %edx ; IPRA-NEXT: movl $32, %esi ; IPRA-NEXT: movw $8, %di -; IPRA-NEXT: movl $buf+2048, %edx +; IPRA-NEXT: movl $buf+2048, %r8d ; IPRA-NEXT: .p2align 4, 0x90 ; IPRA-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1 -; IPRA-NEXT: tileloadd (%r8,%rsi), %tmm0 +; IPRA-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; IPRA-NEXT: callq foo -; IPRA-NEXT: tilestored %tmm0, (%rdx,%rsi) +; IPRA-NEXT: tilestored %tmm0, (%r8,%rsi) ; IPRA-NEXT: callq foo ; IPRA-NEXT: decl %ecx ; IPRA-NEXT: cmpl $7, %ecx @@ -485,14 +485,14 @@ ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $1088, %rsp # imm = 0x440 -; CHECK-NEXT: movl %edi, %r15d +; CHECK-NEXT: movl %edi, %ebx ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: movb $1, (%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf, %r14d -; CHECK-NEXT: movl $32, %ebx +; CHECK-NEXT: movl $32, %r15d ; CHECK-NEXT: movw $8, %bp ; CHECK-NEXT: movl $buf+2048, %r12d ; CHECK-NEXT: .p2align 4, 0x90 @@ -500,17 +500,17 @@ ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg (%rsp) -; CHECK-NEXT: testl %r15d, %r15d +; CHECK-NEXT: testl %ebx, %ebx ; CHECK-NEXT: jle .LBB3_3 ; CHECK-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 -; CHECK-NEXT: tileloadd (%r14,%rbx), %tmm0 +; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0 ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tilestored %tmm0, 64(%rsp,%rax) # 1024-byte Folded Spill ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm0 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm0, (%r12,%rbx) +; CHECK-NEXT: tilestored %tmm0, (%r12,%r15) ; CHECK-NEXT: callq foo ; CHECK-NEXT: jmp .LBB3_1 ; CHECK-NEXT: .LBB3_3: diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll --- a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -stop-before virtregrewriter | FileCheck %s define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row_from, i32 %c_row_to, i32 %c_row_tile, i32 %c_col_from, i32 %c_col_to, i32 %c_col_tile) { + ; Check LEA64_32r register is split to COPY10 ; CHECK-LABEL: name: foo ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) @@ -56,6 +57,7 @@ ; CHECK-NEXT: MOV64mr %stack.4, 1, $noreg, 0, $noreg, [[MOVSX64rr32_1]] :: (store (s64) into %stack.4) ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = MOVSX64rr32 %84.sub_32bit ; CHECK-NEXT: [[MOVSX64rm32_:%[0-9]+]]:gr64_nosp = MOVSX64rm32 %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY [[COPY]] ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = MOVSX64rr32 %88.sub_32bit ; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = MOVSX64rm32 %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16) ; CHECK-NEXT: [[MOVSX64rr32_4:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm1]] @@ -63,13 +65,13 @@ ; CHECK-NEXT: [[MOVSX64rr32_6:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm]] ; 
CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, [[MOVSX64rr32_6]] :: (store (s64) into %stack.8) ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOVSX64rr32_4]] :: (store (s64) into %stack.6) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]] ; CHECK-NEXT: [[IMUL64rr:%[0-9]+]]:gr64_nosp = IMUL64rr [[IMUL64rr]], [[MOVSX64rr32_2]], implicit-def dead $eflags ; CHECK-NEXT: [[ADD64rr:%[0-9]+]]:gr64_nosp = ADD64rr [[ADD64rr]], [[MOVSX64rm32_]], implicit-def dead $eflags - ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[ADD64rr]], 0, $noreg + ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY2]], 4, [[ADD64rr]], 0, $noreg ; CHECK-NEXT: MOV64mr %stack.9, 1, $noreg, 0, $noreg, [[LEA64r]] :: (store (s64) into %stack.9) ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, [[MOVSX64rr32_5]] :: (store (s64) into %stack.7) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]] ; CHECK-NEXT: [[IMUL64rr:%[0-9]+]]:gr64 = IMUL64rr [[IMUL64rr]], [[MOVSX64rr32_2]], implicit-def dead $eflags ; CHECK-NEXT: [[SHL64ri:%[0-9]+]]:gr64 = SHL64ri [[SHL64ri]], 2, implicit-def dead $eflags ; CHECK-NEXT: MOV64mr %stack.10, 1, $noreg, 0, $noreg, [[SHL64ri]] :: (store (s64) into %stack.10) @@ -77,28 +79,29 @@ ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[MOVSX64rm32_]] :: (store (s64) into %stack.5) ; CHECK-NEXT: [[LEA64_32r2:%[0-9]+]]:gr32 = LEA64_32r %61, 4, [[MOVSX64rm32_]], 0, $noreg ; CHECK-NEXT: MOV32mr %stack.11, 1, $noreg, 0, $noreg, [[LEA64_32r2]] :: (store (s32) into %stack.11) - ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[LEA64r1]] :: (store (s64) into %stack.13) + ; CHECK-NEXT: MOV64mr %stack.12, 1, $noreg, 0, $noreg, [[LEA64r1]] :: (store (s64) into %stack.12) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.for.cond14.preheader: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[MOV32rm3:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8) ; CHECK-NEXT: CMP32rm [[MOV32rm3]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16) + ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) ; CHECK-NEXT: JCC_1 %bb.5, 13, implicit killed $eflags ; CHECK-NEXT: JMP_1 %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.for.body17.lr.ph: ; CHECK-NEXT: successors: %bb.6(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) + ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) ; CHECK-NEXT: [[IMUL64rr:%[0-9]+]]:gr64 = nsw IMUL64rr [[IMUL64rr]], [[MOVSX64rr32_]], implicit-def dead $eflags ; CHECK-NEXT: [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[ADD64rm]], %stack.3, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.3) ; CHECK-NEXT: [[ADD64rm1:%[0-9]+]]:gr64 = ADD64rm [[ADD64rm1]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1) - ; CHECK-NEXT: MOV64mr %stack.12, 1, $noreg, 0, $noreg, [[ADD64rm1]] :: (store (s64) into %stack.12) + ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[ADD64rm1]] :: (store (s64) into %stack.13) ; CHECK-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %stack.11, 1, $noreg, 0, $noreg :: (load 
(s32) from %stack.11) ; CHECK-NEXT: undef %68.sub_32bit:gr64_nosp = COPY [[MOV32rm4]] - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) - ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) + ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) + ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) ; CHECK-NEXT: JMP_1 %bb.6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4.for.cond.cleanup: @@ -107,10 +110,10 @@ ; CHECK-NEXT: bb.5.for.cond.cleanup16: ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) + ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) ; CHECK-NEXT: [[ADD64rm1:%[0-9]+]]:gr64 = ADD64rm [[ADD64rm1]], %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.7) - ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10) - ; CHECK-NEXT: ADD64mr %stack.9, 1, $noreg, 0, $noreg, [[MOV64rm2]], implicit-def dead $eflags :: (store (s64) into %stack.9) + ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10) + ; CHECK-NEXT: ADD64mr %stack.9, 1, $noreg, 0, $noreg, [[MOV64rm3]], implicit-def dead $eflags :: (store (s64) into %stack.9) ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[ADD64rm1]] :: (store (s64) into %stack.6) ; CHECK-NEXT: CMP64rm [[ADD64rm1]], %stack.8, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s64) from %stack.8) ; CHECK-NEXT: JCC_1 %bb.2, 12, implicit killed $eflags @@ -120,39 +123,39 @@ ; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV32rm2]].sub_16bit, %88.sub_16bit - ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.12, 1, $noreg, 0, $noreg :: (load (s64) from %stack.12) - ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm2]].sub_16bit, [[SUB32rr]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_]], 0, $noreg + ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13) + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm2]].sub_16bit, [[SUB32rr]].sub_16bit, [[MOV64rm4]], 1, [[MOVSX64rr32_]], 0, $noreg ; CHECK-NEXT: [[MOVSX64rr32_7:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[MOVSX64rr32_7]].sub_32bit - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY %88 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY [[MOVSX64rm32_1]] - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr32 = COPY [[MOV32rm2]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr32 = COPY [[SUB32rr]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY [[COPY1]] - ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY9]], 1, [[MOVSX64rr32_7]], 0, $noreg - ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64_nosp = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) - ; Check LEA64_32r register is split to COPY10 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r]] + ; CHECK-NEXT: 
[[COPY3:%[0-9]+]]:gr32 = COPY [[SUB32rr]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr32 = COPY [[MOV32rm2]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64 = COPY [[MOVSX64rm32_1]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY %88 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr64 = COPY [[COPY1]] + ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY10]], 1, [[MOVSX64rr32_7]], 0, $noreg + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr32 = COPY [[LEA64_32r]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr64_nosp = COPY [[MOV64rm]] ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2) - ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm5]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm4]], 0, $noreg - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr32 = COPY [[COPY10]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr64 = COPY [[COPY9]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY8]] - ; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13) - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64_nosp = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64_nosp = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64_nosp = COPY [[COPY3]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64_nosp = COPY [[COPY2]] - ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY13]].sub_16bit, [[COPY11]].sub_16bit, [[COPY12]].sub_16bit, [[PTDPBSSDV]], [[PTILELOADDV]], [[PTILELOADDV1]] - ; CHECK-NEXT: PTILESTOREDV [[COPY13]].sub_16bit, [[COPY18]].sub_16bit, [[MOV64rm]], 1, [[COPY16]], 0, $noreg, [[PTDPBSSDV]] - ; CHECK-NEXT: [[ADD64rr1:%[0-9]+]]:gr64 = ADD64rr [[ADD64rr1]], [[COPY15]], implicit-def dead $eflags - ; CHECK-NEXT: [[ADD64rr2:%[0-9]+]]:gr64 = ADD64rr [[ADD64rr2]], [[MOV64rm5]], implicit-def dead $eflags - ; CHECK-NEXT: [[MOVSX64rr32_7]].sub_32bit:gr64_nosp = ADD32rr [[MOVSX64rr32_7]].sub_32bit, [[COPY11]], implicit-def dead $eflags - ; CHECK-NEXT: CMP64rr [[ADD64rr1]], [[COPY14]], implicit-def $eflags + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm5]].sub_16bit, [[COPY11]].sub_16bit, [[LEA64r2]], 1, [[COPY12]], 0, $noreg + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr64 = COPY [[COPY12]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr32 = COPY [[COPY11]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[COPY10]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64_nosp = COPY [[COPY9]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64_nosp = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64_nosp = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64_nosp = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:gr64 = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:gr32 = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:gr32 = COPY [[COPY3]] + ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.12, 1, $noreg, 0, $noreg :: (load (s64) from %stack.12) + ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY20]].sub_16bit, [[COPY14]].sub_16bit, [[COPY21]].sub_16bit, [[PTDPBSSDV]], [[PTILELOADDV]], [[PTILELOADDV1]] + ; CHECK-NEXT: PTILESTOREDV [[COPY20]].sub_16bit, [[COPY15]].sub_16bit, [[MOV64rm1]], 1, [[COPY17]], 0, $noreg, [[PTDPBSSDV]] + ; CHECK-NEXT: [[ADD64rr1:%[0-9]+]]:gr64 = ADD64rr [[ADD64rr1]], [[COPY18]], implicit-def dead $eflags + ; 
CHECK-NEXT: [[ADD64rr2:%[0-9]+]]:gr64 = ADD64rr [[ADD64rr2]], [[MOV64rm4]], implicit-def dead $eflags + ; CHECK-NEXT: [[MOVSX64rr32_7]].sub_32bit:gr64_nosp = ADD32rr [[MOVSX64rr32_7]].sub_32bit, [[COPY14]], implicit-def dead $eflags + ; CHECK-NEXT: CMP64rr [[ADD64rr1]], [[COPY19]], implicit-def $eflags ; CHECK-NEXT: JCC_1 %bb.6, 12, implicit killed $eflags ; CHECK-NEXT: JMP_1 %bb.5 entry: diff --git a/llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll b/llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll --- a/llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll @@ -18,20 +18,20 @@ ; CHECK-NEXT: movb $16, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $64, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movl $64, %r8d +; CHECK-NEXT: movl $64, %eax ; CHECK-NEXT: movw $64, %cx -; CHECK-NEXT: movw $16, %ax -; CHECK-NEXT: tileloadd (%rdi,%r8), %tmm0 +; CHECK-NEXT: movw $16, %r8w +; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm0 ; CHECK-NEXT: addq $1024, %rdi # imm = 0x400 -; CHECK-NEXT: tileloadd (%rdi,%r8), %tmm1 -; CHECK-NEXT: tileloadd (%rdx,%r8), %tmm3 +; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm1 +; CHECK-NEXT: tileloadd (%rdx,%rax), %tmm3 ; CHECK-NEXT: leaq 1024(%rdx), %rdi -; CHECK-NEXT: tileloadd (%rdi,%r8), %tmm2 -; CHECK-NEXT: tileloadd (%rsi,%r8), %tmm4 +; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm2 +; CHECK-NEXT: tileloadd (%rsi,%rax), %tmm4 ; CHECK-NEXT: tdpbssd %tmm4, %tmm0, %tmm3 -; CHECK-NEXT: tilestored %tmm3, (%rdx,%r8) +; CHECK-NEXT: tilestored %tmm3, (%rdx,%rax) ; CHECK-NEXT: tdpbssd %tmm4, %tmm1, %tmm2 -; CHECK-NEXT: tilestored %tmm2, (%rdi,%r8) +; CHECK-NEXT: tilestored %tmm2, (%rdi,%rax) ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll --- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll @@ -237,15 +237,15 @@ ; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movl $buf, %r8d +; CHECK-NEXT: movl $buf, %ecx ; CHECK-NEXT: movl $32, %edx -; CHECK-NEXT: leal -1(%rsi), %ecx +; CHECK-NEXT: leal -1(%rsi), %r8d ; CHECK-NEXT: jmp .LBB4_1 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB4_3: # %if.false ; CHECK-NEXT: # in Loop: Header=BB4_1 Depth=1 -; CHECK-NEXT: movl %ecx, %esi -; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, %esi +; CHECK-NEXT: movw %r8w, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: cmpw $7, %si ; CHECK-NEXT: jne .LBB4_5 ; CHECK-NEXT: .LBB4_1: # %loop.bb1 @@ -256,7 +256,7 @@ ; CHECK-NEXT: # %bb.2: # %if.true ; CHECK-NEXT: # in Loop: Header=BB4_1 Depth=1 ; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%r8,%rdx) +; CHECK-NEXT: tilestored %tmm0, (%rcx,%rdx) ; CHECK-NEXT: cmpw $7, %si ; CHECK-NEXT: je .LBB4_1 ; CHECK-NEXT: .LBB4_5: # %exit @@ -296,7 +296,7 @@ ; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: xorl %r8d, %r8d +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movl $buf, %ecx ; CHECK-NEXT: movl $32, %edx ; CHECK-NEXT: xorl %esi, %esi @@ -307,8 +307,8 @@ ; CHECK-NEXT: decl %esi ; CHECK-NEXT: .LBB5_4: # %loop.bb2 ; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1 -; CHECK-NEXT: leal (%rdi,%rsi), %eax -; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: leal (%rdi,%rsi), %r8d +; CHECK-NEXT: movw %r8w, -{{[0-9]+}}(%rsp) ; 
CHECK-NEXT: cmpw $7, %si ; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm0 @@ -316,7 +316,7 @@ ; CHECK-NEXT: jne .LBB5_5 ; CHECK-NEXT: .LBB5_1: # %loop.bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %r8b, %r8b +; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB5_3 ; CHECK-NEXT: # %bb.2: # %if.true ; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1 diff --git a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll --- a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll @@ -22,14 +22,14 @@ ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $64, %eax -; CHECK-NEXT: movw $8, %r14w +; CHECK-NEXT: movw $8, %bp ; CHECK-NEXT: tileloadd (%rdi,%rax), %tmm3 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB0_3 ; CHECK-NEXT: # %bb.1: # %loop.header.preheader ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: xorl %ebp, %ebp +; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: movl $32, %r15d ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_2: # %loop.header @@ -51,8 +51,8 @@ ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 ; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15) -; CHECK-NEXT: incl %ebp -; CHECK-NEXT: cmpw $100, %bp +; CHECK-NEXT: incl %r14d +; CHECK-NEXT: cmpw $100, %r14w ; CHECK-NEXT: jl .LBB0_2 ; CHECK-NEXT: .LBB0_3: # %exit ; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8 @@ -112,8 +112,8 @@ ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB1_3 ; CHECK-NEXT: # %bb.1: # %loop.header.preheader -; CHECK-NEXT: movq %rdi, %r14 -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: movl $32, %r15d ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_2: # %loop.header @@ -123,12 +123,12 @@ ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm2 -; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0 -; CHECK-NEXT: tileloadd (%r14,%r15), %tmm1 +; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0 +; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1 ; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: tilestored %tmm2, (%r14,%r15) -; CHECK-NEXT: incl %ebx -; CHECK-NEXT: cmpw $100, %bx +; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15) +; CHECK-NEXT: incl %r14d +; CHECK-NEXT: cmpw $100, %r14w ; CHECK-NEXT: jl .LBB1_2 ; CHECK-NEXT: .LBB1_3: # %exit ; CHECK-NEXT: addq $72, %rsp diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll --- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll @@ -137,26 +137,26 @@ ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB1_3 ; CHECK-NEXT: # %bb.1: # %loop.header.preheader -; CHECK-NEXT: movq %rdi, %r15 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movl $32, %r14d -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %r15d, %r15d ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_2: # %loop.header ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: tilestored %tmm0, (%r15,%r14) +; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: tilezero %tmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: tileloadd (%r15,%r14), %tmm1 -; CHECK-NEXT: tileloadd (%r15,%r14), %tmm2 +; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1 +; CHECK-NEXT: 
tileloadd (%rbx,%r14), %tmm2 ; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%r15,%r14) +; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14) ; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: incl %ebx -; CHECK-NEXT: cmpw $100, %bx +; CHECK-NEXT: incl %r15d +; CHECK-NEXT: cmpw $100, %r15w ; CHECK-NEXT: jl .LBB1_2 ; CHECK-NEXT: .LBB1_3: # %exit ; CHECK-NEXT: addq $72, %rsp diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill.ll b/llvm/test/CodeGen/X86/AMX/amx-spill.ll --- a/llvm/test/CodeGen/X86/AMX/amx-spill.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill.ll @@ -28,30 +28,25 @@ ; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %dl, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movl $buf, %r8d +; CHECK-NEXT: movl $buf, %ecx ; CHECK-NEXT: movl $32, %eax -; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1 -; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1 -; CHECK-NEXT: movabsq $64, %rcx -; CHECK-NEXT: tilestored %tmm1, -64(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd (%r8,%rax), %tmm3 -; CHECK-NEXT: tileloadd (%r8,%rax), %tmm4 -; CHECK-NEXT: tileloadd (%r8,%rax), %tmm2 -; CHECK-NEXT: tileloadd (%r8,%rax), %tmm5 -; CHECK-NEXT: tileloadd (%r8,%rax), %tmm0 +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1 +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1 +; CHECK-NEXT: movabsq $64, %r8 +; CHECK-NEXT: tilestored %tmm1, -64(%rsp,%r8) # 1024-byte Folded Spill +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm3 +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm4 +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm2 +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm5 +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0 ; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: je .LBB0_2 +; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: tileloadd (%r8,%rax), %tmm6 -; CHECK-NEXT: tileloadd (%r8,%rax), %tmm7 -; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1 -; CHECK-NEXT: jmp .LBB0_3 -; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: movl $buf2, %ecx +; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm6 ; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm7 ; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1 -; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: tdpbssd %tmm7, %tmm6, %tmm1 ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd -64(%rsp,%rax), %tmm7 # 1024-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll --- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -452,30 +452,30 @@ define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) { ; BWON-LABEL: MergeLoadStoreBaseIndexOffset: ; BWON: # %bb.0: -; BWON-NEXT: movl %ecx, %r8d +; BWON-NEXT: movl %ecx, %eax ; BWON-NEXT: xorl %ecx, %ecx ; BWON-NEXT: .p2align 4, 0x90 ; BWON-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 -; BWON-NEXT: movq (%rdi,%rcx,8), %rax -; BWON-NEXT: movzwl (%rdx,%rax), %eax -; BWON-NEXT: movw %ax, (%rsi,%rcx,2) +; BWON-NEXT: movq (%rdi,%rcx,8), %r8 +; BWON-NEXT: movzwl (%rdx,%r8), %r8d +; BWON-NEXT: movw %r8w, (%rsi,%rcx,2) ; BWON-NEXT: incq %rcx -; BWON-NEXT: cmpl %ecx, %r8d +; BWON-NEXT: cmpl %ecx, %eax ; BWON-NEXT: jne .LBB9_1 ; BWON-NEXT: # %bb.2: ; BWON-NEXT: retq ; ; BWOFF-LABEL: MergeLoadStoreBaseIndexOffset: ; BWOFF: # %bb.0: -; BWOFF-NEXT: movl %ecx, %r8d +; BWOFF-NEXT: movl %ecx, %eax ; BWOFF-NEXT: xorl %ecx, %ecx ; BWOFF-NEXT: .p2align 4, 0x90 ; BWOFF-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 -; BWOFF-NEXT: movq (%rdi,%rcx,8), %rax -; BWOFF-NEXT: movw (%rdx,%rax), %ax -; BWOFF-NEXT: movw %ax, 
(%rsi,%rcx,2) +; BWOFF-NEXT: movq (%rdi,%rcx,8), %r8 +; BWOFF-NEXT: movw (%rdx,%r8), %r8w +; BWOFF-NEXT: movw %r8w, (%rsi,%rcx,2) ; BWOFF-NEXT: incq %rcx -; BWOFF-NEXT: cmpl %ecx, %r8d +; BWOFF-NEXT: cmpl %ecx, %eax ; BWOFF-NEXT: jne .LBB9_1 ; BWOFF-NEXT: # %bb.2: ; BWOFF-NEXT: retq @@ -509,30 +509,30 @@ define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) { ; BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated: ; BWON: # %bb.0: -; BWON-NEXT: xorl %r8d, %r8d +; BWON-NEXT: xorl %eax, %eax ; BWON-NEXT: .p2align 4, 0x90 ; BWON-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 -; BWON-NEXT: movsbq (%rsi), %rax -; BWON-NEXT: movzwl (%rdx,%rax), %eax -; BWON-NEXT: movw %ax, (%rdi,%r8) +; BWON-NEXT: movsbq (%rsi), %r8 +; BWON-NEXT: movzwl (%rdx,%r8), %r8d +; BWON-NEXT: movw %r8w, (%rdi,%rax) ; BWON-NEXT: incq %rsi -; BWON-NEXT: addq $2, %r8 -; BWON-NEXT: cmpq %rcx, %r8 +; BWON-NEXT: addq $2, %rax +; BWON-NEXT: cmpq %rcx, %rax ; BWON-NEXT: jl .LBB10_1 ; BWON-NEXT: # %bb.2: ; BWON-NEXT: retq ; ; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated: ; BWOFF: # %bb.0: -; BWOFF-NEXT: xorl %r8d, %r8d +; BWOFF-NEXT: xorl %eax, %eax ; BWOFF-NEXT: .p2align 4, 0x90 ; BWOFF-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 -; BWOFF-NEXT: movsbq (%rsi), %rax -; BWOFF-NEXT: movw (%rdx,%rax), %ax -; BWOFF-NEXT: movw %ax, (%rdi,%r8) +; BWOFF-NEXT: movsbq (%rsi), %r8 +; BWOFF-NEXT: movw (%rdx,%r8), %r8w +; BWOFF-NEXT: movw %r8w, (%rdi,%rax) ; BWOFF-NEXT: incq %rsi -; BWOFF-NEXT: addq $2, %r8 -; BWOFF-NEXT: cmpq %rcx, %r8 +; BWOFF-NEXT: addq $2, %rax +; BWOFF-NEXT: cmpq %rcx, %rax ; BWOFF-NEXT: jl .LBB10_1 ; BWOFF-NEXT: # %bb.2: ; BWOFF-NEXT: retq @@ -568,30 +568,30 @@ define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) { ; BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext: ; BWON: # %bb.0: -; BWON-NEXT: movl %ecx, %r8d +; BWON-NEXT: movl %ecx, %eax ; BWON-NEXT: xorl %ecx, %ecx ; BWON-NEXT: .p2align 4, 0x90 ; BWON-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 -; BWON-NEXT: movsbq (%rdi,%rcx), %rax -; BWON-NEXT: movzwl (%rdx,%rax), %eax -; BWON-NEXT: movw %ax, (%rsi,%rcx,2) +; BWON-NEXT: movsbq (%rdi,%rcx), %r8 +; BWON-NEXT: movzwl (%rdx,%r8), %r8d +; BWON-NEXT: movw %r8w, (%rsi,%rcx,2) ; BWON-NEXT: incq %rcx -; BWON-NEXT: cmpl %ecx, %r8d +; BWON-NEXT: cmpl %ecx, %eax ; BWON-NEXT: jne .LBB11_1 ; BWON-NEXT: # %bb.2: ; BWON-NEXT: retq ; ; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext: ; BWOFF: # %bb.0: -; BWOFF-NEXT: movl %ecx, %r8d +; BWOFF-NEXT: movl %ecx, %eax ; BWOFF-NEXT: xorl %ecx, %ecx ; BWOFF-NEXT: .p2align 4, 0x90 ; BWOFF-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 -; BWOFF-NEXT: movsbq (%rdi,%rcx), %rax -; BWOFF-NEXT: movw (%rdx,%rax), %ax -; BWOFF-NEXT: movw %ax, (%rsi,%rcx,2) +; BWOFF-NEXT: movsbq (%rdi,%rcx), %r8 +; BWOFF-NEXT: movw (%rdx,%r8), %r8w +; BWOFF-NEXT: movw %r8w, (%rsi,%rcx,2) ; BWOFF-NEXT: incq %rcx -; BWOFF-NEXT: cmpl %ecx, %r8d +; BWOFF-NEXT: cmpl %ecx, %eax ; BWOFF-NEXT: jne .LBB11_1 ; BWOFF-NEXT: # %bb.2: ; BWOFF-NEXT: retq @@ -626,38 +626,38 @@ define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) { ; BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex: ; BWON: # %bb.0: -; BWON-NEXT: movl %ecx, %r8d +; BWON-NEXT: movl %ecx, %eax ; BWON-NEXT: xorl %ecx, %ecx ; BWON-NEXT: .p2align 4, 0x90 ; BWON-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 -; BWON-NEXT: movsbq (%rdi,%rcx), %rax -; BWON-NEXT: movzbl (%rdx,%rax), %r9d -; BWON-NEXT: incl %eax -; BWON-NEXT: movsbq %al, %rax -; BWON-NEXT: 
movzbl (%rdx,%rax), %eax +; BWON-NEXT: movsbq (%rdi,%rcx), %r8 +; BWON-NEXT: movzbl (%rdx,%r8), %r9d +; BWON-NEXT: incl %r8d +; BWON-NEXT: movsbq %r8b, %r8 +; BWON-NEXT: movzbl (%rdx,%r8), %r8d ; BWON-NEXT: movb %r9b, (%rsi,%rcx,2) -; BWON-NEXT: movb %al, 1(%rsi,%rcx,2) +; BWON-NEXT: movb %r8b, 1(%rsi,%rcx,2) ; BWON-NEXT: incq %rcx -; BWON-NEXT: cmpl %ecx, %r8d +; BWON-NEXT: cmpl %ecx, %eax ; BWON-NEXT: jne .LBB12_1 ; BWON-NEXT: # %bb.2: ; BWON-NEXT: retq ; ; BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex: ; BWOFF: # %bb.0: -; BWOFF-NEXT: movl %ecx, %r8d +; BWOFF-NEXT: movl %ecx, %eax ; BWOFF-NEXT: xorl %ecx, %ecx ; BWOFF-NEXT: .p2align 4, 0x90 ; BWOFF-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 -; BWOFF-NEXT: movsbq (%rdi,%rcx), %rax -; BWOFF-NEXT: movb (%rdx,%rax), %r9b -; BWOFF-NEXT: incl %eax -; BWOFF-NEXT: movsbq %al, %rax -; BWOFF-NEXT: movb (%rdx,%rax), %al +; BWOFF-NEXT: movsbq (%rdi,%rcx), %r8 +; BWOFF-NEXT: movb (%rdx,%r8), %r9b +; BWOFF-NEXT: incl %r8d +; BWOFF-NEXT: movsbq %r8b, %r8 +; BWOFF-NEXT: movb (%rdx,%r8), %r8b ; BWOFF-NEXT: movb %r9b, (%rsi,%rcx,2) -; BWOFF-NEXT: movb %al, 1(%rsi,%rcx,2) +; BWOFF-NEXT: movb %r8b, 1(%rsi,%rcx,2) ; BWOFF-NEXT: incq %rcx -; BWOFF-NEXT: cmpl %ecx, %r8d +; BWOFF-NEXT: cmpl %ecx, %eax ; BWOFF-NEXT: jne .LBB12_1 ; BWOFF-NEXT: # %bb.2: ; BWOFF-NEXT: retq @@ -921,11 +921,12 @@ } define i32 @merge_store_load_store_seq(i32* %buff) { -entry: ; CHECK-LABEL: merge_store_load_store_seq: -; CHECK: movl 4(%rdi), %eax -; CHECK-NEXT: movq $0, (%rdi) -; CHECK-NEXT: retq +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl 4(%rdi), %eax +; CHECK-NEXT: movq $0, (%rdi) +; CHECK-NEXT: retq +entry: store i32 0, i32* %buff, align 4 %arrayidx1 = getelementptr inbounds i32, i32* %buff, i64 1 @@ -935,12 +936,13 @@ } define i32 @merge_store_alias(i32* %buff, i32* %other) { -entry: ; CHECK-LABEL: merge_store_alias: -; CHECK: movl $0, (%rdi) -; CHECK-NEXT: movl (%rsi), %eax -; CHECK-NEXT: movl $0, 4(%rdi) -; CHECK-NEXT: retq +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl $0, (%rdi) +; CHECK-NEXT: movl (%rsi), %eax +; CHECK-NEXT: movl $0, 4(%rdi) +; CHECK-NEXT: retq +entry: store i32 0, i32* %buff, align 4 %arrayidx1 = getelementptr inbounds i32, i32* %buff, i64 1 diff --git a/llvm/test/CodeGen/X86/StackColoring.ll b/llvm/test/CodeGen/X86/StackColoring.ll --- a/llvm/test/CodeGen/X86/StackColoring.ll +++ b/llvm/test/CodeGen/X86/StackColoring.ll @@ -82,9 +82,9 @@ } ;CHECK-LABEL: myCall_w4: -;YESCOLOR: subq $120, %rsp -;NOFIRSTUSE: subq $200, %rsp -;NOCOLOR: subq $408, %rsp +;YESCOLOR: subq $112, %rsp +;NOFIRSTUSE: subq $208, %rsp +;NOCOLOR: subq $400, %rsp define i32 @myCall_w4(i32 %in) { entry: diff --git a/llvm/test/CodeGen/X86/add-and-not.ll b/llvm/test/CodeGen/X86/add-and-not.ll --- a/llvm/test/CodeGen/X86/add-and-not.ll +++ b/llvm/test/CodeGen/X86/add-and-not.ll @@ -92,16 +92,16 @@ ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %r14d -; CHECK-NEXT: movl %r14d, %eax +; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: notb %al -; CHECK-NEXT: movzbl %al, %ebp -; CHECK-NEXT: movl %ebp, %edi +; CHECK-NEXT: movzbl %al, %r14d +; CHECK-NEXT: movl %r14d, %edi ; CHECK-NEXT: callq use@PLT -; CHECK-NEXT: andb %bl, %bpl -; CHECK-NEXT: movzbl %bpl, %edi +; CHECK-NEXT: andb %bl, %r14b +; CHECK-NEXT: movzbl %r14b, %edi ; CHECK-NEXT: callq use@PLT -; CHECK-NEXT: orb %r14b, %bl +; CHECK-NEXT: orb %bpl, %bl ; CHECK-NEXT: movl %ebx, %eax ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 diff --git 
a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -316,20 +316,20 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: addq (%rsi), %rdx -; CHECK-NEXT: movq 8(%rsi), %r11 -; CHECK-NEXT: adcq $0, %r11 +; CHECK-NEXT: movq 8(%rsi), %rdi +; CHECK-NEXT: adcq $0, %rdi ; CHECK-NEXT: setb %r10b -; CHECK-NEXT: movzbl %r10b, %edi -; CHECK-NEXT: addq %rcx, %r11 -; CHECK-NEXT: adcq 16(%rsi), %rdi +; CHECK-NEXT: movzbl %r10b, %r10d +; CHECK-NEXT: addq %rcx, %rdi +; CHECK-NEXT: adcq 16(%rsi), %r10 ; CHECK-NEXT: setb %cl ; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: addq %r8, %rdi +; CHECK-NEXT: addq %r8, %r10 ; CHECK-NEXT: adcq 24(%rsi), %rcx ; CHECK-NEXT: addq %r9, %rcx ; CHECK-NEXT: movq %rdx, (%rax) -; CHECK-NEXT: movq %r11, 8(%rax) -; CHECK-NEXT: movq %rdi, 16(%rax) +; CHECK-NEXT: movq %rdi, 8(%rax) +; CHECK-NEXT: movq %r10, 16(%rax) ; CHECK-NEXT: movq %rcx, 24(%rax) ; CHECK-NEXT: retq entry: @@ -751,27 +751,27 @@ ; CHECK-NEXT: adcq %rdx, 8(%rdi) ; CHECK-NEXT: movq %rax, %rdx ; CHECK-NEXT: adcq %rcx, %rdx -; CHECK-NEXT: movq 24(%rdi), %r11 -; CHECK-NEXT: leaq (%r8,%r11), %r14 +; CHECK-NEXT: movq 24(%rdi), %rsi +; CHECK-NEXT: leaq (%r8,%rsi), %r11 ; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: cmpq %r10, %rdx ; CHECK-NEXT: setb %bl ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: adcq %r14, %rbx -; CHECK-NEXT: movq 32(%rdi), %r10 -; CHECK-NEXT: leaq (%r9,%r10), %rcx -; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: cmpq %r14, %rbx -; CHECK-NEXT: setb %sil -; CHECK-NEXT: addq %r11, %r8 -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq %r11, %rbx +; CHECK-NEXT: movq 32(%rdi), %rcx +; CHECK-NEXT: leaq (%r9,%rcx), %r10 +; CHECK-NEXT: xorl %r14d, %r14d +; CHECK-NEXT: cmpq %r11, %rbx +; CHECK-NEXT: setb %r14b +; CHECK-NEXT: addq %rsi, %r8 +; CHECK-NEXT: adcq %r10, %r14 ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq %rcx, %rsi +; CHECK-NEXT: cmpq %r10, %r14 ; CHECK-NEXT: setb %al -; CHECK-NEXT: addq %r10, %r9 +; CHECK-NEXT: addq %rcx, %r9 ; CHECK-NEXT: movq %rdx, 16(%rdi) ; CHECK-NEXT: movq %rbx, 24(%rdi) -; CHECK-NEXT: movq %rsi, 32(%rdi) +; CHECK-NEXT: movq %r14, 32(%rdi) ; CHECK-NEXT: adcl $0, %eax ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 @@ -1219,18 +1219,18 @@ ; CHECK-LABEL: add_U256_without_i128_or_by_i64_words: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq (%rdx), %r8 +; CHECK-NEXT: movq (%rdx), %rcx ; CHECK-NEXT: movq 8(%rdx), %rdi -; CHECK-NEXT: addq (%rsi), %r8 +; CHECK-NEXT: addq (%rsi), %rcx ; CHECK-NEXT: adcq 8(%rsi), %rdi -; CHECK-NEXT: movq 16(%rdx), %rcx -; CHECK-NEXT: adcq 16(%rsi), %rcx +; CHECK-NEXT: movq 16(%rdx), %r8 +; CHECK-NEXT: adcq 16(%rsi), %r8 ; CHECK-NEXT: movq 24(%rdx), %rdx ; CHECK-NEXT: adcq 24(%rsi), %rdx ; CHECK-NEXT: movq %rdx, (%rax) -; CHECK-NEXT: movq %rcx, 8(%rax) +; CHECK-NEXT: movq %r8, 8(%rax) ; CHECK-NEXT: movq %rdi, 16(%rax) -; CHECK-NEXT: movq %r8, 24(%rax) +; CHECK-NEXT: movq %rcx, 24(%rax) ; CHECK-NEXT: retq %4 = load i64, ptr %1, align 8 %5 = load i64, ptr %2, align 8 @@ -1279,17 +1279,17 @@ ; CHECK-LABEL: add_U256_without_i128_or_recursive: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq (%rdx), %r8 +; CHECK-NEXT: movq (%rdx), %rcx ; CHECK-NEXT: movq 8(%rdx), %rdi -; CHECK-NEXT: addq (%rsi), %r8 +; CHECK-NEXT: addq (%rsi), %rcx ; CHECK-NEXT: adcq 8(%rsi), %rdi -; CHECK-NEXT: movq 16(%rdx), %rcx +; CHECK-NEXT: movq 16(%rdx), %r8 ; CHECK-NEXT: movq 24(%rdx), %rdx -; CHECK-NEXT: adcq 16(%rsi), 
%rcx +; CHECK-NEXT: adcq 16(%rsi), %r8 ; CHECK-NEXT: adcq 24(%rsi), %rdx -; CHECK-NEXT: movq %r8, (%rax) +; CHECK-NEXT: movq %rcx, (%rax) ; CHECK-NEXT: movq %rdi, 8(%rax) -; CHECK-NEXT: movq %rcx, 16(%rax) +; CHECK-NEXT: movq %r8, 16(%rax) ; CHECK-NEXT: movq %rdx, 24(%rax) ; CHECK-NEXT: retq %4 = load i64, ptr %1, align 8 diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -1771,56 +1771,52 @@ ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: addq %rsi, %rcx +; SSE2-NEXT: addq %rax, %rcx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: addq %rbp, %rax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: leaq -1(%rdx,%rsi), %r11 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%rdi,%rdx), %rsi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r8,%rdx), %rdi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r9,%rdx), %r8 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%rbx,%rdx), %rbx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r10,%rdx), %r9 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r13,%rdx), %r13 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r12,%rdx), %r12 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r14,%rdx), %r14 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: leaq -1(%r15,%rdx), %r15 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: leaq -1(%r13,%rbp), %r13 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: leaq -1(%r12,%rbp), %r12 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: leaq -1(%r15,%rbp), %r15 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: leaq -1(%r14,%rbp), %r14 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: leaq -1(%rbx,%rbp), %rbx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; 
SSE2-NEXT: leaq -1(%r11,%rbp), %r11 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: leaq -1(%r10,%rbp), %r10 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: leaq -1(%r9,%rbp), %r9 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: leaq -1(%r8,%rbp), %r8 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: leaq -1(%rdi,%rbp), %rdi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: leaq -1(%rsi,%rbp), %rsi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: leaq -1(%rdx,%rbp), %rdx ; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rdx), %r10 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx ; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx @@ -1835,74 +1831,74 @@ ; SSE2-NEXT: adcq $-1, %rbp ; SSE2-NEXT: shldq $63, %rax, %rbp ; SSE2-NEXT: shldq $63, %rcx, %rdx -; SSE2-NEXT: movq %rdx, %xmm8 +; SSE2-NEXT: movq %rdx, %xmm1 ; SSE2-NEXT: movq %rbp, %xmm0 -; SSE2-NEXT: shrq %r11 -; SSE2-NEXT: movq %r11, %xmm9 -; SSE2-NEXT: shrq %rsi -; SSE2-NEXT: movq %rsi, %xmm2 -; SSE2-NEXT: shrq %rdi -; SSE2-NEXT: movq %rdi, %xmm10 -; SSE2-NEXT: shrq %r8 -; SSE2-NEXT: movq %r8, %xmm4 -; SSE2-NEXT: shrq %rbx -; SSE2-NEXT: movq %rbx, %xmm11 -; SSE2-NEXT: shrq %r9 -; SSE2-NEXT: movq %r9, %xmm7 ; SSE2-NEXT: shrq %r13 -; SSE2-NEXT: movq %r13, %xmm12 +; SSE2-NEXT: movq %r13, %xmm3 ; SSE2-NEXT: shrq %r12 -; SSE2-NEXT: movq %r12, %xmm1 -; SSE2-NEXT: shrq %r14 -; SSE2-NEXT: movq %r14, %xmm13 +; SSE2-NEXT: movq %r12, %xmm2 ; SSE2-NEXT: shrq %r15 -; SSE2-NEXT: movq %r15, %xmm6 +; SSE2-NEXT: movq %r15, %xmm5 +; SSE2-NEXT: shrq %r14 +; SSE2-NEXT: movq %r14, %xmm4 +; SSE2-NEXT: shrq %rbx +; SSE2-NEXT: movq %rbx, %xmm6 +; SSE2-NEXT: shrq %r11 +; SSE2-NEXT: movq %r11, %xmm7 +; SSE2-NEXT: shrq %r10 +; SSE2-NEXT: movq %r10, %xmm9 +; SSE2-NEXT: shrq %r9 +; SSE2-NEXT: movq %r9, %xmm8 +; SSE2-NEXT: shrq %r8 +; SSE2-NEXT: movq %r8, %xmm10 +; SSE2-NEXT: shrq %rdi +; SSE2-NEXT: movq %rdi, %xmm11 +; SSE2-NEXT: shrq %rsi +; SSE2-NEXT: movq %rsi, %xmm12 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm14 -; SSE2-NEXT: shrq %r10 -; SSE2-NEXT: movq %r10, %xmm5 +; SSE2-NEXT: movq %rax, %xmm13 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm15 +; SSE2-NEXT: movq %rax, %xmm14 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: movq %rax, %xmm15 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; SSE2-NEXT: psllq $48, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; SSE2-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE2-NEXT: movupd 
%xmm1, (%rax) +; SSE2-NEXT: por %xmm8, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; SSE2-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,1] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm13, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE2-NEXT: movupd %xmm2, (%rax) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -1927,118 +1923,116 @@ ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: vpextrw $5, %xmm0, %eax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $7, %xmm0, %r11d -; AVX1-NEXT: vpextrw $0, %xmm3, %r14d -; AVX1-NEXT: vpextrw $1, %xmm3, %r15d -; AVX1-NEXT: vpextrw $2, %xmm3, %r10d -; AVX1-NEXT: vpextrw $3, %xmm3, %r9d -; AVX1-NEXT: vpextrw $4, %xmm3, %r8d -; AVX1-NEXT: vpextrw $5, %xmm3, %ebx -; AVX1-NEXT: vpextrw $6, %xmm3, %ebp -; AVX1-NEXT: vpextrw $7, %xmm3, %edi -; AVX1-NEXT: vpextrw $1, %xmm0, %edx -; AVX1-NEXT: vpextrw $0, %xmm0, %esi +; AVX1-NEXT: vpextrw $6, %xmm0, %r10d +; AVX1-NEXT: vpextrw $7, %xmm0, %edx +; AVX1-NEXT: vpextrw $0, %xmm3, %edi +; AVX1-NEXT: vpextrw $1, %xmm3, %r8d +; AVX1-NEXT: vpextrw $2, %xmm3, %r9d +; AVX1-NEXT: vpextrw $3, %xmm3, %r11d +; AVX1-NEXT: vpextrw $4, %xmm3, %ebx +; AVX1-NEXT: vpextrw $5, %xmm3, %r14d +; AVX1-NEXT: vpextrw $6, %xmm3, %r15d +; AVX1-NEXT: vpextrw $7, %xmm3, %esi +; AVX1-NEXT: vpextrw $1, %xmm0, %r13d +; AVX1-NEXT: vpextrw $0, %xmm0, %r12d ; AVX1-NEXT: vpextrw $1, %xmm1, %ecx -; AVX1-NEXT: addq %rdx, %rcx +; AVX1-NEXT: addq %r13, %rcx ; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: addq %rsi, %rax -; AVX1-NEXT: vpextrw $7, %xmm2, %edx -; AVX1-NEXT: leaq -1(%rdi,%rdx), %rdi -; AVX1-NEXT: vpextrw $6, %xmm2, %edx -; AVX1-NEXT: leaq -1(%rbp,%rdx), %rbp -; AVX1-NEXT: vpextrw $5, %xmm2, %edx -; AVX1-NEXT: leaq -1(%rbx,%rdx), %rbx -; AVX1-NEXT: vpextrw $4, %xmm2, %edx -; AVX1-NEXT: leaq -1(%r8,%rdx), %r8 -; AVX1-NEXT: vpextrw $3, %xmm2, %edx -; AVX1-NEXT: leaq -1(%r9,%rdx), %r9 -; AVX1-NEXT: vpextrw $2, %xmm2, %edx -; AVX1-NEXT: leaq -1(%r10,%rdx), %r10 -; AVX1-NEXT: vpextrw $1, %xmm2, %edx -; AVX1-NEXT: leaq -1(%r15,%rdx), %r13 -; AVX1-NEXT: vpextrw $0, %xmm2, %edx -; AVX1-NEXT: leaq -1(%r14,%rdx), %r12 -; AVX1-NEXT: vpextrw $7, %xmm1, %edx -; AVX1-NEXT: leaq -1(%r11,%rdx), %r15 +; AVX1-NEXT: addq %r12, %rax +; AVX1-NEXT: vpextrw $7, %xmm2, %r12d +; AVX1-NEXT: leaq -1(%rsi,%r12), %rsi +; AVX1-NEXT: vpextrw $6, %xmm2, %r12d +; AVX1-NEXT: leaq -1(%r15,%r12), %rbp +; AVX1-NEXT: vpextrw $5, %xmm2, %r15d +; AVX1-NEXT: leaq -1(%r14,%r15), %r13 +; AVX1-NEXT: vpextrw $4, %xmm2, %r14d +; AVX1-NEXT: leaq -1(%rbx,%r14), %r12 +; AVX1-NEXT: vpextrw $3, %xmm2, %ebx +; AVX1-NEXT: leaq -1(%r11,%rbx), %r15 +; AVX1-NEXT: vpextrw $2, %xmm2, %r11d +; AVX1-NEXT: leaq -1(%r9,%r11), 
%r14 +; AVX1-NEXT: vpextrw $1, %xmm2, %r9d +; AVX1-NEXT: leaq -1(%r8,%r9), %rbx +; AVX1-NEXT: vpextrw $0, %xmm2, %r8d +; AVX1-NEXT: leaq -1(%rdi,%r8), %r11 +; AVX1-NEXT: vpextrw $7, %xmm1, %edi +; AVX1-NEXT: leaq -1(%rdx,%rdi), %r9 ; AVX1-NEXT: vpextrw $6, %xmm1, %edx -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX1-NEXT: leaq -1(%rsi,%rdx), %r14 +; AVX1-NEXT: leaq -1(%r10,%rdx), %r8 ; AVX1-NEXT: vpextrw $5, %xmm1, %edx -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX1-NEXT: leaq -1(%rsi,%rdx), %r11 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX1-NEXT: leaq -1(%rdi,%rdx), %rdi ; AVX1-NEXT: vpextrw $4, %xmm1, %edx -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX1-NEXT: leaq -1(%rsi,%rdx), %rdx +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX1-NEXT: leaq -1(%r10,%rdx), %rdx ; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: vpextrw $3, %xmm0, %edx -; AVX1-NEXT: vpextrw $3, %xmm1, %esi -; AVX1-NEXT: leaq -1(%rdx,%rsi), %rdx +; AVX1-NEXT: vpextrw $3, %xmm1, %r10d +; AVX1-NEXT: leaq -1(%rdx,%r10), %rdx ; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: vpextrw $2, %xmm0, %edx -; AVX1-NEXT: vpextrw $2, %xmm1, %esi -; AVX1-NEXT: leaq -1(%rdx,%rsi), %rdx +; AVX1-NEXT: vpextrw $2, %xmm1, %r10d +; AVX1-NEXT: leaq -1(%rdx,%r10), %rdx ; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: xorl %edx, %edx ; AVX1-NEXT: addq $-1, %rcx -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: adcq $-1, %rsi +; AVX1-NEXT: movl $0, %r10d +; AVX1-NEXT: adcq $-1, %r10 ; AVX1-NEXT: addq $-1, %rax ; AVX1-NEXT: adcq $-1, %rdx ; AVX1-NEXT: shldq $63, %rax, %rdx -; AVX1-NEXT: shldq $63, %rcx, %rsi -; AVX1-NEXT: shrq %rdi -; AVX1-NEXT: vmovq %rdi, %xmm8 +; AVX1-NEXT: shldq $63, %rcx, %r10 +; AVX1-NEXT: shrq %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 ; AVX1-NEXT: shrq %rbp -; AVX1-NEXT: vmovq %rbp, %xmm9 -; AVX1-NEXT: shrq %rbx -; AVX1-NEXT: vmovq %rbx, %xmm0 -; AVX1-NEXT: shrq %r8 -; AVX1-NEXT: vmovq %r8, %xmm1 -; AVX1-NEXT: shrq %r9 -; AVX1-NEXT: vmovq %r9, %xmm12 -; AVX1-NEXT: shrq %r10 -; AVX1-NEXT: vmovq %r10, %xmm13 +; AVX1-NEXT: vmovq %rbp, %xmm1 ; AVX1-NEXT: shrq %r13 -; AVX1-NEXT: vmovq %r13, %xmm14 +; AVX1-NEXT: vmovq %r13, %xmm2 ; AVX1-NEXT: shrq %r12 -; AVX1-NEXT: vmovq %r12, %xmm15 +; AVX1-NEXT: vmovq %r12, %xmm3 ; AVX1-NEXT: shrq %r15 -; AVX1-NEXT: vmovq %r15, %xmm10 +; AVX1-NEXT: vmovq %r15, %xmm4 ; AVX1-NEXT: shrq %r14 -; AVX1-NEXT: vmovq %r14, %xmm11 +; AVX1-NEXT: vmovq %r14, %xmm5 +; AVX1-NEXT: shrq %rbx +; AVX1-NEXT: vmovq %rbx, %xmm6 ; AVX1-NEXT: shrq %r11 -; AVX1-NEXT: vmovq %r11, %xmm2 +; AVX1-NEXT: vmovq %r11, %xmm7 +; AVX1-NEXT: shrq %r9 +; AVX1-NEXT: vmovq %r9, %xmm8 +; AVX1-NEXT: shrq %r8 +; AVX1-NEXT: vmovq %r8, %xmm9 +; AVX1-NEXT: shrq %rdi +; AVX1-NEXT: vmovq %rdi, %xmm10 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm3 -; AVX1-NEXT: vmovq %rsi, %xmm4 -; AVX1-NEXT: vmovq %rdx, %xmm5 +; AVX1-NEXT: vmovq %rax, %xmm11 +; AVX1-NEXT: vmovq %r10, %xmm12 +; AVX1-NEXT: vmovq %rdx, %xmm13 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm6 +; AVX1-NEXT: vmovq %rax, %xmm14 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm7 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = 
xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,6],xmm8[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm8[6,7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vmovq %rax, %xmm15 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; AVX1-NEXT: vpslld $16, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] @@ -2064,7 +2058,7 @@ ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 @@ -2074,63 +2068,62 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX2-NEXT: vmovq %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vmovq %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vmovq %xmm7, %r13 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vmovq %xmm2, %r11 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %r14 -; AVX2-NEXT: vpextrq $1, %xmm0, %rbx -; AVX2-NEXT: vpextrq $1, %xmm2, %rsi -; AVX2-NEXT: vpextrq $1, %xmm7, %r12 -; AVX2-NEXT: vpextrq $1, %xmm6, %r15 +; AVX2-NEXT: vmovq %xmm2, %rbp +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-NEXT: vmovq %xmm8, %r8 +; AVX2-NEXT: vpextrq $1, %xmm8, %r15 +; AVX2-NEXT: vpextrq $1, %xmm2, %r14 +; AVX2-NEXT: vpextrq $1, %xmm7, %rbx +; AVX2-NEXT: vpextrq $1, %xmm6, %rsi ; AVX2-NEXT: vpextrq $1, %xmm5, %rdx ; AVX2-NEXT: vpextrq $1, %xmm1, %rcx ; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: vmovq %xmm3, %rbp -; AVX2-NEXT: vpextrq $1, %xmm9, %r9 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovq %xmm3, %rdi +; AVX2-NEXT: vpextrq $1, %xmm0, %r10 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0 -; 
AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rdi -; AVX2-NEXT: addq %rbx, %rdi -; AVX2-NEXT: movq %rdi, %rbx -; AVX2-NEXT: vpextrq $1, %xmm8, %r10 -; AVX2-NEXT: addq %rsi, %r10 -; AVX2-NEXT: vpextrq $1, %xmm7, %rsi -; AVX2-NEXT: addq %r12, %rsi -; AVX2-NEXT: movq %rsi, %r12 -; AVX2-NEXT: vpextrq $1, %xmm4, %r13 -; AVX2-NEXT: addq %r15, %r13 -; AVX2-NEXT: vpextrq $1, %xmm5, %r15 -; AVX2-NEXT: addq %rdx, %r15 -; AVX2-NEXT: vpextrq $1, %xmm3, %r8 -; AVX2-NEXT: addq %rcx, %r8 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-NEXT: vpextrq $1, %xmm9, %r11 +; AVX2-NEXT: addq %r15, %r11 +; AVX2-NEXT: vpextrq $1, %xmm8, %r9 +; AVX2-NEXT: addq %r14, %r9 +; AVX2-NEXT: movq %r9, %r14 +; AVX2-NEXT: vpextrq $1, %xmm7, %r9 +; AVX2-NEXT: addq %rbx, %r9 +; AVX2-NEXT: movq %r9, %rbx +; AVX2-NEXT: vpextrq $1, %xmm4, %r15 +; AVX2-NEXT: addq %rsi, %r15 +; AVX2-NEXT: vpextrq $1, %xmm5, %r12 +; AVX2-NEXT: addq %rdx, %r12 +; AVX2-NEXT: vpextrq $1, %xmm3, %r9 +; AVX2-NEXT: addq %rcx, %r9 ; AVX2-NEXT: vpextrq $1, %xmm6, %rsi ; AVX2-NEXT: addq %rax, %rsi ; AVX2-NEXT: vmovq %xmm6, %rdx -; AVX2-NEXT: addq %rbp, %rdx +; AVX2-NEXT: addq %rdi, %rdx ; AVX2-NEXT: vpextrq $1, %xmm2, %rcx -; AVX2-NEXT: addq %r9, %rcx -; AVX2-NEXT: vmovq %xmm0, %rdi -; AVX2-NEXT: leaq -1(%r14,%rdi), %rax +; AVX2-NEXT: addq %r10, %rcx +; AVX2-NEXT: vmovq %xmm9, %r10 +; AVX2-NEXT: leaq -1(%r8,%r10), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: vmovq %xmm8, %rdi -; AVX2-NEXT: leaq -1(%r11,%rdi), %rax +; AVX2-NEXT: leaq -1(%rbp,%rdi), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: vmovq %xmm7, %rdi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdi), %rax +; AVX2-NEXT: leaq -1(%r13,%rdi), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: vmovq %xmm4, %rdi ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload @@ -2141,37 +2134,37 @@ ; AVX2-NEXT: leaq -1(%rax,%rdi), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: vmovq %xmm1, %rdi -; AVX2-NEXT: vmovq %xmm3, %rbp -; AVX2-NEXT: leaq -1(%rdi,%rbp), %rax +; AVX2-NEXT: vmovq %xmm3, %r8 +; AVX2-NEXT: leaq -1(%rdi,%r8), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm9, %rdi -; AVX2-NEXT: vmovq %xmm2, %rbp -; AVX2-NEXT: leaq -1(%rdi,%rbp), %rdi +; AVX2-NEXT: vmovq %xmm0, %rdi +; AVX2-NEXT: vmovq %xmm2, %r8 +; AVX2-NEXT: leaq -1(%rdi,%r8), %rdi ; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: addq $-1, %rbx -; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, 
%r9d -; AVX2-NEXT: adcq $-1, %r9 -; AVX2-NEXT: addq $-1, %r10 -; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %r11 +; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %r8d +; AVX2-NEXT: adcq $-1, %r8 +; AVX2-NEXT: addq $-1, %r14 +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: movl $0, %edi ; AVX2-NEXT: adcq $-1, %rdi -; AVX2-NEXT: addq $-1, %r12 -; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %rbx +; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: movl $0, %r11d ; AVX2-NEXT: adcq $-1, %r11 -; AVX2-NEXT: addq $-1, %r13 +; AVX2-NEXT: addq $-1, %r15 ; AVX2-NEXT: movl $0, %r10d ; AVX2-NEXT: adcq $-1, %r10 -; AVX2-NEXT: addq $-1, %r15 +; AVX2-NEXT: addq $-1, %r12 ; AVX2-NEXT: movl $0, %r14d ; AVX2-NEXT: adcq $-1, %r14 -; AVX2-NEXT: addq $-1, %r8 +; AVX2-NEXT: addq $-1, %r9 ; AVX2-NEXT: movl $0, %ebp ; AVX2-NEXT: adcq $-1, %rbp ; AVX2-NEXT: addq $-1, %rsi -; AVX2-NEXT: movl $0, %r12d -; AVX2-NEXT: adcq $-1, %r12 +; AVX2-NEXT: movl $0, %r13d +; AVX2-NEXT: adcq $-1, %r13 ; AVX2-NEXT: addq $-1, %rdx ; AVX2-NEXT: movl $0, %ebx ; AVX2-NEXT: adcq $-1, %rbx @@ -2180,64 +2173,64 @@ ; AVX2-NEXT: adcq $-1, %rax ; AVX2-NEXT: shldq $63, %rcx, %rax ; AVX2-NEXT: shldq $63, %rdx, %rbx -; AVX2-NEXT: shldq $63, %rsi, %r12 -; AVX2-NEXT: shldq $63, %r8, %rbp -; AVX2-NEXT: shldq $63, %r15, %r14 -; AVX2-NEXT: shldq $63, %r13, %r10 +; AVX2-NEXT: shldq $63, %rsi, %r13 +; AVX2-NEXT: shldq $63, %r9, %rbp +; AVX2-NEXT: shldq $63, %r12, %r14 +; AVX2-NEXT: shldq $63, %r15, %r10 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shldq $63, %rcx, %r11 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shldq $63, %rcx, %rdi ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %r9 -; AVX2-NEXT: vmovq %r9, %xmm8 +; AVX2-NEXT: shldq $63, %rcx, %r8 +; AVX2-NEXT: vmovq %r8, %xmm0 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm9 -; AVX2-NEXT: vmovq %rdi, %xmm0 +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vmovq %rdi, %xmm2 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vmovq %r11, %xmm12 +; AVX2-NEXT: vmovq %rcx, %xmm3 +; AVX2-NEXT: vmovq %r11, %xmm4 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm13 -; AVX2-NEXT: vmovq %r10, %xmm14 +; AVX2-NEXT: vmovq %rcx, %xmm5 +; AVX2-NEXT: vmovq %r10, %xmm6 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm15 -; AVX2-NEXT: vmovq %r14, %xmm10 +; AVX2-NEXT: vmovq %rcx, %xmm7 +; AVX2-NEXT: vmovq %r14, %xmm8 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm11 -; AVX2-NEXT: vmovq %rbp, %xmm2 +; AVX2-NEXT: vmovq %rcx, %xmm9 +; AVX2-NEXT: vmovq %rbp, %xmm10 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm3 -; AVX2-NEXT: vmovq %r12, %xmm4 -; AVX2-NEXT: vmovq %rbx, %xmm5 -; AVX2-NEXT: vmovq %rax, %xmm6 +; AVX2-NEXT: vmovq %rcx, %xmm11 +; AVX2-NEXT: vmovq %r13, %xmm12 +; AVX2-NEXT: vmovq %rbx, %xmm13 +; AVX2-NEXT: vmovq %rax, %xmm14 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX2-NEXT: 
shrq %rax -; AVX2-NEXT: vmovq %rax, %xmm7 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpbroadcastw %xmm8, %xmm8 -; AVX2-NEXT: vpbroadcastw %xmm9, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-NEXT: vmovq %rax, %xmm15 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm9, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5],xmm2[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; AVX2-NEXT: vpslld $16, %xmm3, %xmm3 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] @@ -2266,25 +2259,25 @@ ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vmovq %xmm4, %rbx -; AVX512-NEXT: vpextrq $1, %xmm4, %rbp -; AVX512-NEXT: vmovq %xmm3, %rdi -; AVX512-NEXT: vpextrq $1, %xmm3, %rsi +; AVX512-NEXT: vmovq %xmm4, %r13 +; AVX512-NEXT: vpextrq $1, %xmm4, %r12 +; AVX512-NEXT: vmovq %xmm3, %r15 +; AVX512-NEXT: vpextrq $1, %xmm3, %r14 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vmovq %xmm2, %rdx -; AVX512-NEXT: vpextrq $1, %xmm2, %r10 +; AVX512-NEXT: vmovq %xmm2, %rbx +; AVX512-NEXT: vpextrq $1, %xmm2, %r11 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-NEXT: vmovq %xmm2, %r13 -; AVX512-NEXT: vpextrq $1, %xmm2, %r14 +; AVX512-NEXT: vmovq %xmm2, %r10 +; AVX512-NEXT: vpextrq $1, %xmm2, %rax ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512-NEXT: vmovq %xmm2, %r15 +; AVX512-NEXT: vmovq %xmm2, %rdi ; AVX512-NEXT: vpextrq $1, %xmm2, %r8 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-NEXT: vmovq %xmm2, %r11 -; AVX512-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vmovq %xmm2, %rsi +; AVX512-NEXT: vpextrq $1, %xmm2, %rdx ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill @@ -2293,50 +2286,48 @@ ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vmovq %xmm4, %rax -; AVX512-NEXT: addq %rbx, %rax -; AVX512-NEXT: movq %rax, %rbx -; AVX512-NEXT: vpextrq $1, %xmm4, %rax -; AVX512-NEXT: addq %rbp, %rax -; AVX512-NEXT: movq %rax, %r9 +; AVX512-NEXT: vmovq %xmm4, %rbp +; AVX512-NEXT: addq %r13, %rbp +; AVX512-NEXT: vpextrq $1, %xmm4, %r13 +; AVX512-NEXT: addq %r12, %r13 ; AVX512-NEXT: vmovq %xmm3, %rcx -; AVX512-NEXT: addq %rdi, %rcx -; AVX512-NEXT: vpextrq $1, %xmm3, %rax -; AVX512-NEXT: addq %rsi, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq %r15, %rcx +; AVX512-NEXT: vpextrq $1, %xmm3, %r9 +; AVX512-NEXT: addq %r14, %r9 +; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vmovq %xmm2, %rbp -; AVX512-NEXT: addq %rdx, %rbp -; AVX512-NEXT: vpextrq $1, %xmm2, %rax -; AVX512-NEXT: addq %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm2, %r9 +; AVX512-NEXT: addq %rbx, %r9 +; AVX512-NEXT: vpextrq $1, %xmm2, %rbx +; AVX512-NEXT: addq %r11, %rbx +; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-NEXT: vmovq %xmm2, %rax -; AVX512-NEXT: addq %r13, %rax -; AVX512-NEXT: movq %rax, %r13 -; AVX512-NEXT: vpextrq $1, %xmm2, %rax -; AVX512-NEXT: addq %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm2, %r11 +; AVX512-NEXT: addq %r10, %r11 +; AVX512-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpextrq $1, %xmm2, %r10 +; AVX512-NEXT: addq %rax, %r10 +; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512-NEXT: vmovq %xmm2, %rax -; AVX512-NEXT: addq %r15, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq %rdi, %rax +; AVX512-NEXT: movq %rax, %r12 ; AVX512-NEXT: vpextrq $1, %xmm2, %rax ; AVX512-NEXT: addq %r8, %rax ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512-NEXT: vmovq %xmm2, %rax -; AVX512-NEXT: addq %r11, %rax +; AVX512-NEXT: addq %rsi, %rax ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vpextrq $1, %xmm2, %r12 -; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; AVX512-NEXT: vpextrq $1, %xmm2, %r15 +; AVX512-NEXT: addq %rdx, %r15 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vmovq %xmm0, %r11 -; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX512-NEXT: vmovq %xmm0, %r10 +; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload ; AVX512-NEXT: vpextrq $1, %xmm0, %r8 ; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload ; AVX512-NEXT: vmovq %xmm1, %rax @@ -2346,34 +2337,29 @@ ; AVX512-NEXT: vpextrq $1, %xmm1, %rsi ; AVX512-NEXT: vpextrq $1, %xmm0, %rdx ; AVX512-NEXT: addq %rsi, %rdx -; AVX512-NEXT: addq $-1, %rbx -; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movl $0, %r15d -; AVX512-NEXT: adcq $-1, %r15 -; AVX512-NEXT: addq $-1, %r9 -; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq $-1, %rbp +; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: movl $0, %r14d ; AVX512-NEXT: adcq $-1, %r14 +; AVX512-NEXT: addq $-1, %r13 +; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movl $0, %ebx +; AVX512-NEXT: adcq $-1, %rbx ; AVX512-NEXT: addq $-1, %rcx -; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq %rcx, (%rsp) # 8-byte Spill ; AVX512-NEXT: movl $0, %esi ; AVX512-NEXT: adcq $-1, %rsi ; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movl $0, %r10d -; AVX512-NEXT: 
adcq $-1, %r10 -; AVX512-NEXT: addq $-1, %rbp -; AVX512-NEXT: movq %rbp, (%rsp) # 8-byte Spill +; AVX512-NEXT: movl $0, %r11d +; AVX512-NEXT: adcq $-1, %r11 +; AVX512-NEXT: addq $-1, %r9 +; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: movl $0, %r9d ; AVX512-NEXT: adcq $-1, %r9 ; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX512-NEXT: movl $0, %eax ; AVX512-NEXT: adcq $-1, %rax ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: addq $-1, %r13 -; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movl $0, %eax -; AVX512-NEXT: adcq $-1, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX512-NEXT: movl $0, %eax ; AVX512-NEXT: adcq $-1, %rax @@ -2382,70 +2368,73 @@ ; AVX512-NEXT: movl $0, %eax ; AVX512-NEXT: adcq $-1, %rax ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq $-1, %r12 +; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movl $0, %ebp +; AVX512-NEXT: adcq $-1, %rbp ; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX512-NEXT: movl $0, %eax ; AVX512-NEXT: adcq $-1, %rax ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: addq $-1, %rcx +; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX512-NEXT: movl $0, %eax ; AVX512-NEXT: adcq $-1, %rax ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: addq $-1, %r12 +; AVX512-NEXT: addq $-1, %r15 ; AVX512-NEXT: movl $0, %eax ; AVX512-NEXT: adcq $-1, %rax ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: addq $-1, %r11 -; AVX512-NEXT: movl $0, %r13d -; AVX512-NEXT: adcq $-1, %r13 +; AVX512-NEXT: addq $-1, %r10 +; AVX512-NEXT: movl $0, %r12d +; AVX512-NEXT: adcq $-1, %r12 ; AVX512-NEXT: addq $-1, %r8 +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: adcq $-1, %rcx +; AVX512-NEXT: addq $-1, %rdi ; AVX512-NEXT: movl $0, %eax ; AVX512-NEXT: adcq $-1, %rax -; AVX512-NEXT: addq $-1, %rdi -; AVX512-NEXT: movl $0, %ebx -; AVX512-NEXT: adcq $-1, %rbx ; AVX512-NEXT: addq $-1, %rdx -; AVX512-NEXT: movl $0, %ebp -; AVX512-NEXT: adcq $-1, %rbp -; AVX512-NEXT: shldq $63, %rdx, %rbp -; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq $63, %rdi, %rbx -; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq $63, %r8, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: shldq $63, %r11, %r13 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: shldq $63, %r12, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX512-NEXT: shldq $63, %rcx, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX512-NEXT: movl $0, %r13d +; AVX512-NEXT: adcq $-1, %r13 +; AVX512-NEXT: shldq $63, %rdx, %r13 +; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq $63, %rdi, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq $63, %r8, %rcx +; AVX512-NEXT: movq %rcx, %r13 +; AVX512-NEXT: shldq $63, %r10, %r12 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: shldq $63, %r15, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; 
AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shldq $63, %rax, %rbp +; AVX512-NEXT: shldq $63, %rax, %rdi +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq $63, %rax, %rbx +; AVX512-NEXT: shldq $63, %rax, %r10 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: shldq $63, %rax, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %rbp ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX512-NEXT: shldq $63, %rax, %rdx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %r15 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX512-NEXT: shldq $63, %rax, %rcx -; AVX512-NEXT: movq (%rsp), %rax # 8-byte Reload -; AVX512-NEXT: shldq $63, %rax, %r9 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shldq $63, %rax, %r10 +; AVX512-NEXT: shldq $63, %rax, %r9 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %r11 +; AVX512-NEXT: movq (%rsp), %rax # 8-byte Reload ; AVX512-NEXT: shldq $63, %rax, %rsi ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shldq $63, %rax, %r14 +; AVX512-NEXT: shldq $63, %rax, %rbx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shldq $63, %rax, %r15 -; AVX512-NEXT: vmovq %r15, %xmm0 -; AVX512-NEXT: vmovq %r14, %xmm1 -; AVX512-NEXT: vmovq %r10, %xmm2 +; AVX512-NEXT: shldq $63, %rax, %r14 +; AVX512-NEXT: vmovq %r14, %xmm0 +; AVX512-NEXT: vmovq %rbx, %xmm1 +; AVX512-NEXT: vmovq %r11, %xmm2 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovq %rsi, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 @@ -2467,8 +2456,8 @@ ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmovd %xmm2, %eax ; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vmovq %rdi, %xmm3 +; AVX512-NEXT: vmovq %r15, %xmm2 +; AVX512-NEXT: vmovq %rdx, %xmm3 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 @@ -2477,16 +2466,15 @@ ; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vmovd %xmm1, %eax ; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX512-NEXT: movq %rbx, %rax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %rbx, %xmm1 -; AVX512-NEXT: vmovq %rbp, %xmm2 +; AVX512-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %rbp, %xmm1 +; AVX512-NEXT: vmovq %r10, %xmm2 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmovd %xmm2, %eax ; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %r12, %xmm2 -; AVX512-NEXT: vmovq %r11, %xmm3 +; AVX512-NEXT: vmovq %rdi, %xmm2 +; AVX512-NEXT: vmovq %r8, %xmm3 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512-NEXT: vextracti32x4 $2, 
%zmm1, %xmm2 @@ -2495,9 +2483,9 @@ ; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vmovd %xmm1, %eax ; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrb $12, %r13d, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %r13, %xmm1 -; AVX512-NEXT: vmovq %r8, %xmm2 +; AVX512-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %r12, %xmm1 +; AVX512-NEXT: vmovq %r13, %xmm2 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmovd %xmm2, %eax diff --git a/llvm/test/CodeGen/X86/avoid-sfb.ll b/llvm/test/CodeGen/X86/avoid-sfb.ll --- a/llvm/test/CodeGen/X86/avoid-sfb.ll +++ b/llvm/test/CodeGen/X86/avoid-sfb.ll @@ -806,21 +806,21 @@ ; CHECK-NEXT: movq %r8, %r15 ; CHECK-NEXT: movq %rcx, %r14 ; CHECK-NEXT: movl %edx, %ebp -; CHECK-NEXT: movq %rsi, %r12 -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movq %rdi, %r12 ; CHECK-NEXT: movl %r9d, 12(%rdi) ; CHECK-NEXT: callq bar@PLT ; CHECK-NEXT: cmpl $18, %ebp ; CHECK-NEXT: jl .LBB9_2 ; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl %ebp, 4(%rbx) -; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: movl %ebp, 4(%r12) +; CHECK-NEXT: movq %r12, %rdi ; CHECK-NEXT: callq bar@PLT ; CHECK-NEXT: .LBB9_2: # %if.end ; CHECK-NEXT: movups (%r15), %xmm0 ; CHECK-NEXT: movups %xmm0, (%r14) -; CHECK-NEXT: movups (%rbx), %xmm0 -; CHECK-NEXT: movups %xmm0, (%r12) +; CHECK-NEXT: movups (%r12), %xmm0 +; CHECK-NEXT: movups %xmm0, (%rbx) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 40 ; CHECK-NEXT: popq %r12 @@ -853,21 +853,21 @@ ; DISABLED-NEXT: movq %r8, %r15 ; DISABLED-NEXT: movq %rcx, %r14 ; DISABLED-NEXT: movl %edx, %ebp -; DISABLED-NEXT: movq %rsi, %r12 -; DISABLED-NEXT: movq %rdi, %rbx +; DISABLED-NEXT: movq %rsi, %rbx +; DISABLED-NEXT: movq %rdi, %r12 ; DISABLED-NEXT: movl %r9d, 12(%rdi) ; DISABLED-NEXT: callq bar@PLT ; DISABLED-NEXT: cmpl $18, %ebp ; DISABLED-NEXT: jl .LBB9_2 ; DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movl %ebp, 4(%rbx) -; DISABLED-NEXT: movq %rbx, %rdi +; DISABLED-NEXT: movl %ebp, 4(%r12) +; DISABLED-NEXT: movq %r12, %rdi ; DISABLED-NEXT: callq bar@PLT ; DISABLED-NEXT: .LBB9_2: # %if.end ; DISABLED-NEXT: movups (%r15), %xmm0 ; DISABLED-NEXT: movups %xmm0, (%r14) -; DISABLED-NEXT: movups (%rbx), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%r12) +; DISABLED-NEXT: movups (%r12), %xmm0 +; DISABLED-NEXT: movups %xmm0, (%rbx) ; DISABLED-NEXT: popq %rbx ; DISABLED-NEXT: .cfi_def_cfa_offset 40 ; DISABLED-NEXT: popq %r12 @@ -900,21 +900,21 @@ ; CHECK-AVX2-NEXT: movq %r8, %r15 ; CHECK-AVX2-NEXT: movq %rcx, %r14 ; CHECK-AVX2-NEXT: movl %edx, %ebp -; CHECK-AVX2-NEXT: movq %rsi, %r12 -; CHECK-AVX2-NEXT: movq %rdi, %rbx +; CHECK-AVX2-NEXT: movq %rsi, %rbx +; CHECK-AVX2-NEXT: movq %rdi, %r12 ; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi) ; CHECK-AVX2-NEXT: callq bar@PLT ; CHECK-AVX2-NEXT: cmpl $18, %ebp ; CHECK-AVX2-NEXT: jl .LBB9_2 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then -; CHECK-AVX2-NEXT: movl %ebp, 4(%rbx) -; CHECK-AVX2-NEXT: movq %rbx, %rdi +; CHECK-AVX2-NEXT: movl %ebp, 4(%r12) +; CHECK-AVX2-NEXT: movq %r12, %rdi ; CHECK-AVX2-NEXT: callq bar@PLT ; CHECK-AVX2-NEXT: .LBB9_2: # %if.end ; CHECK-AVX2-NEXT: vmovups (%r15), %xmm0 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%r14) -; CHECK-AVX2-NEXT: vmovups (%rbx), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%r12) +; CHECK-AVX2-NEXT: vmovups (%r12), %xmm0 +; CHECK-AVX2-NEXT: vmovups %xmm0, (%rbx) ; CHECK-AVX2-NEXT: popq %rbx ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40 ; CHECK-AVX2-NEXT: popq %r12 @@ -947,21 
+947,21 @@ ; CHECK-AVX512-NEXT: movq %r8, %r15 ; CHECK-AVX512-NEXT: movq %rcx, %r14 ; CHECK-AVX512-NEXT: movl %edx, %ebp -; CHECK-AVX512-NEXT: movq %rsi, %r12 -; CHECK-AVX512-NEXT: movq %rdi, %rbx +; CHECK-AVX512-NEXT: movq %rsi, %rbx +; CHECK-AVX512-NEXT: movq %rdi, %r12 ; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi) ; CHECK-AVX512-NEXT: callq bar@PLT ; CHECK-AVX512-NEXT: cmpl $18, %ebp ; CHECK-AVX512-NEXT: jl .LBB9_2 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then -; CHECK-AVX512-NEXT: movl %ebp, 4(%rbx) -; CHECK-AVX512-NEXT: movq %rbx, %rdi +; CHECK-AVX512-NEXT: movl %ebp, 4(%r12) +; CHECK-AVX512-NEXT: movq %r12, %rdi ; CHECK-AVX512-NEXT: callq bar@PLT ; CHECK-AVX512-NEXT: .LBB9_2: # %if.end ; CHECK-AVX512-NEXT: vmovups (%r15), %xmm0 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%r14) -; CHECK-AVX512-NEXT: vmovups (%rbx), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%r12) +; CHECK-AVX512-NEXT: vmovups (%r12), %xmm0 +; CHECK-AVX512-NEXT: vmovups %xmm0, (%rbx) ; CHECK-AVX512-NEXT: popq %rbx ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40 ; CHECK-AVX512-NEXT: popq %r12 @@ -1012,24 +1012,24 @@ ; CHECK-NEXT: .cfi_offset %r15, -16 ; CHECK-NEXT: movq %r8, %r12 ; CHECK-NEXT: movq %rcx, %r15 -; CHECK-NEXT: movq %rsi, %r14 -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movq %rdi, %r14 ; CHECK-NEXT: movl %r9d, 12(%rdi) ; CHECK-NEXT: cmpl $18, %edx ; CHECK-NEXT: jl .LBB10_2 ; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl %edx, 4(%rbx) -; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: movl %edx, 4(%r14) +; CHECK-NEXT: movq %r14, %rdi ; CHECK-NEXT: callq bar@PLT ; CHECK-NEXT: .LBB10_2: # %if.end ; CHECK-NEXT: movups (%r12), %xmm0 ; CHECK-NEXT: movups %xmm0, (%r15) -; CHECK-NEXT: movq (%rbx), %rax -; CHECK-NEXT: movq %rax, (%r14) -; CHECK-NEXT: movl 8(%rbx), %eax -; CHECK-NEXT: movl %eax, 8(%r14) -; CHECK-NEXT: movl 12(%rbx), %eax -; CHECK-NEXT: movl %eax, 12(%r14) +; CHECK-NEXT: movq (%r14), %rax +; CHECK-NEXT: movq %rax, (%rbx) +; CHECK-NEXT: movl 8(%r14), %eax +; CHECK-NEXT: movl %eax, 8(%rbx) +; CHECK-NEXT: movl 12(%r14), %eax +; CHECK-NEXT: movl %eax, 12(%rbx) ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 40 ; CHECK-NEXT: popq %rbx @@ -1060,20 +1060,20 @@ ; DISABLED-NEXT: .cfi_offset %r15, -16 ; DISABLED-NEXT: movq %r8, %r15 ; DISABLED-NEXT: movq %rcx, %r14 -; DISABLED-NEXT: movq %rsi, %r12 -; DISABLED-NEXT: movq %rdi, %rbx +; DISABLED-NEXT: movq %rsi, %rbx +; DISABLED-NEXT: movq %rdi, %r12 ; DISABLED-NEXT: movl %r9d, 12(%rdi) ; DISABLED-NEXT: cmpl $18, %edx ; DISABLED-NEXT: jl .LBB10_2 ; DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movl %edx, 4(%rbx) -; DISABLED-NEXT: movq %rbx, %rdi +; DISABLED-NEXT: movl %edx, 4(%r12) +; DISABLED-NEXT: movq %r12, %rdi ; DISABLED-NEXT: callq bar@PLT ; DISABLED-NEXT: .LBB10_2: # %if.end ; DISABLED-NEXT: movups (%r15), %xmm0 ; DISABLED-NEXT: movups %xmm0, (%r14) -; DISABLED-NEXT: movups (%rbx), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%r12) +; DISABLED-NEXT: movups (%r12), %xmm0 +; DISABLED-NEXT: movups %xmm0, (%rbx) ; DISABLED-NEXT: addq $8, %rsp ; DISABLED-NEXT: .cfi_def_cfa_offset 40 ; DISABLED-NEXT: popq %rbx @@ -1104,24 +1104,24 @@ ; CHECK-AVX2-NEXT: .cfi_offset %r15, -16 ; CHECK-AVX2-NEXT: movq %r8, %r12 ; CHECK-AVX2-NEXT: movq %rcx, %r15 -; CHECK-AVX2-NEXT: movq %rsi, %r14 -; CHECK-AVX2-NEXT: movq %rdi, %rbx +; CHECK-AVX2-NEXT: movq %rsi, %rbx +; CHECK-AVX2-NEXT: movq %rdi, %r14 ; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi) ; CHECK-AVX2-NEXT: cmpl $18, %edx ; CHECK-AVX2-NEXT: jl .LBB10_2 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then -; 
CHECK-AVX2-NEXT: movl %edx, 4(%rbx) -; CHECK-AVX2-NEXT: movq %rbx, %rdi +; CHECK-AVX2-NEXT: movl %edx, 4(%r14) +; CHECK-AVX2-NEXT: movq %r14, %rdi ; CHECK-AVX2-NEXT: callq bar@PLT ; CHECK-AVX2-NEXT: .LBB10_2: # %if.end ; CHECK-AVX2-NEXT: vmovups (%r12), %xmm0 ; CHECK-AVX2-NEXT: vmovups %xmm0, (%r15) -; CHECK-AVX2-NEXT: movq (%rbx), %rax -; CHECK-AVX2-NEXT: movq %rax, (%r14) -; CHECK-AVX2-NEXT: movl 8(%rbx), %eax -; CHECK-AVX2-NEXT: movl %eax, 8(%r14) -; CHECK-AVX2-NEXT: movl 12(%rbx), %eax -; CHECK-AVX2-NEXT: movl %eax, 12(%r14) +; CHECK-AVX2-NEXT: movq (%r14), %rax +; CHECK-AVX2-NEXT: movq %rax, (%rbx) +; CHECK-AVX2-NEXT: movl 8(%r14), %eax +; CHECK-AVX2-NEXT: movl %eax, 8(%rbx) +; CHECK-AVX2-NEXT: movl 12(%r14), %eax +; CHECK-AVX2-NEXT: movl %eax, 12(%rbx) ; CHECK-AVX2-NEXT: addq $8, %rsp ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40 ; CHECK-AVX2-NEXT: popq %rbx @@ -1152,24 +1152,24 @@ ; CHECK-AVX512-NEXT: .cfi_offset %r15, -16 ; CHECK-AVX512-NEXT: movq %r8, %r12 ; CHECK-AVX512-NEXT: movq %rcx, %r15 -; CHECK-AVX512-NEXT: movq %rsi, %r14 -; CHECK-AVX512-NEXT: movq %rdi, %rbx +; CHECK-AVX512-NEXT: movq %rsi, %rbx +; CHECK-AVX512-NEXT: movq %rdi, %r14 ; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi) ; CHECK-AVX512-NEXT: cmpl $18, %edx ; CHECK-AVX512-NEXT: jl .LBB10_2 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then -; CHECK-AVX512-NEXT: movl %edx, 4(%rbx) -; CHECK-AVX512-NEXT: movq %rbx, %rdi +; CHECK-AVX512-NEXT: movl %edx, 4(%r14) +; CHECK-AVX512-NEXT: movq %r14, %rdi ; CHECK-AVX512-NEXT: callq bar@PLT ; CHECK-AVX512-NEXT: .LBB10_2: # %if.end ; CHECK-AVX512-NEXT: vmovups (%r12), %xmm0 ; CHECK-AVX512-NEXT: vmovups %xmm0, (%r15) -; CHECK-AVX512-NEXT: movq (%rbx), %rax -; CHECK-AVX512-NEXT: movq %rax, (%r14) -; CHECK-AVX512-NEXT: movl 8(%rbx), %eax -; CHECK-AVX512-NEXT: movl %eax, 8(%r14) -; CHECK-AVX512-NEXT: movl 12(%rbx), %eax -; CHECK-AVX512-NEXT: movl %eax, 12(%r14) +; CHECK-AVX512-NEXT: movq (%r14), %rax +; CHECK-AVX512-NEXT: movq %rax, (%rbx) +; CHECK-AVX512-NEXT: movl 8(%r14), %eax +; CHECK-AVX512-NEXT: movl %eax, 8(%rbx) +; CHECK-AVX512-NEXT: movl 12(%r14), %eax +; CHECK-AVX512-NEXT: movl %eax, 12(%rbx) ; CHECK-AVX512-NEXT: addq $8, %rsp ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40 ; CHECK-AVX512-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -1486,10 +1486,10 @@ ; ; X64-LABEL: test_mm256_set_epi8: ; X64: # %bb.0: -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vmovd %eax, %xmm0 -; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vmovd %r10d, %xmm0 +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax @@ -2102,10 +2102,10 @@ ; ; X64-LABEL: test_mm256_setr_epi8: ; X64: # %bb.0: -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vmovd %eax, %xmm0 -; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vmovd %r10d, %xmm0 +; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll --- 
a/llvm/test/CodeGen/X86/avx-load-store.ll +++ b/llvm/test/CodeGen/X86/avx-load-store.ll @@ -9,9 +9,9 @@ ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $96, %rsp -; CHECK-NEXT: movq %rdx, %r14 -; CHECK-NEXT: movq %rsi, %r15 -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rsi, %r14 +; CHECK-NEXT: movq %rdi, %r15 ; CHECK-NEXT: vmovaps (%rdi), %ymm0 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vmovaps (%rsi), %ymm1 @@ -20,11 +20,11 @@ ; CHECK-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; CHECK-NEXT: callq dummy@PLT ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vmovaps %ymm0, (%rbx) -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: vmovaps %ymm0, (%r15) -; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: vmovaps %ymm0, (%r14) +; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovaps %ymm0, (%rbx) ; CHECK-NEXT: addq $96, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -435,17 +435,17 @@ ; ALL_X64-NEXT: .cfi_offset %rbx, -32 ; ALL_X64-NEXT: .cfi_offset %r14, -24 ; ALL_X64-NEXT: .cfi_offset %rbp, -16 -; ALL_X64-NEXT: movl %esi, %r14d +; ALL_X64-NEXT: movl %esi, %ebx ; ALL_X64-NEXT: movl %edi, %ebp ; ALL_X64-NEXT: movl %edx, %esi ; ALL_X64-NEXT: callq _test11 -; ALL_X64-NEXT: movzbl %al, %ebx +; ALL_X64-NEXT: movzbl %al, %r14d ; ALL_X64-NEXT: movl %ebp, %edi -; ALL_X64-NEXT: movl %r14d, %esi -; ALL_X64-NEXT: movl %ebx, %edx +; ALL_X64-NEXT: movl %ebx, %esi +; ALL_X64-NEXT: movl %r14d, %edx ; ALL_X64-NEXT: callq _test10 ; ALL_X64-NEXT: xorl %ecx, %ecx -; ALL_X64-NEXT: testb $1, %bl +; ALL_X64-NEXT: testb $1, %r14b ; ALL_X64-NEXT: cmovel %ecx, %eax ; ALL_X64-NEXT: popq %rbx ; ALL_X64-NEXT: popq %r14 @@ -497,17 +497,17 @@ ; FASTISEL-NEXT: .cfi_offset %rbx, -32 ; FASTISEL-NEXT: .cfi_offset %r14, -24 ; FASTISEL-NEXT: .cfi_offset %rbp, -16 -; FASTISEL-NEXT: movl %esi, %r14d +; FASTISEL-NEXT: movl %esi, %ebx ; FASTISEL-NEXT: movl %edi, %ebp ; FASTISEL-NEXT: movl %edx, %esi ; FASTISEL-NEXT: callq _test11 -; FASTISEL-NEXT: movzbl %al, %ebx +; FASTISEL-NEXT: movzbl %al, %r14d ; FASTISEL-NEXT: movl %ebp, %edi -; FASTISEL-NEXT: movl %r14d, %esi -; FASTISEL-NEXT: movl %ebx, %edx +; FASTISEL-NEXT: movl %ebx, %esi +; FASTISEL-NEXT: movl %r14d, %edx ; FASTISEL-NEXT: callq _test10 ; FASTISEL-NEXT: xorl %ecx, %ecx -; FASTISEL-NEXT: testb $1, %bl +; FASTISEL-NEXT: testb $1, %r14b ; FASTISEL-NEXT: cmovel %ecx, %eax ; FASTISEL-NEXT: popq %rbx ; FASTISEL-NEXT: popq %r14 @@ -910,84 +910,84 @@ ; KNL-NEXT: kandw %k2, %k0, %k0 ; KNL-NEXT: kmovw %r10d, %k2 ; KNL-NEXT: kandw %k1, %k2, %k1 -; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftrw $1, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftrw $3, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r11d +; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftrw $4, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r12d +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: kshiftrw $5, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r15d +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftrw $6, %k0, %k1 -; 
KNL-NEXT: kmovw %k1, %r14d +; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftrw $7, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r13d +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftrw $8, %k0, %k1 ; KNL-NEXT: kmovw %k1, %ebx ; KNL-NEXT: kshiftrw $9, %k0, %k1 -; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: kmovw %k1, %r14d ; KNL-NEXT: kshiftrw $10, %k0, %k1 ; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftrw $11, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r15d ; KNL-NEXT: kshiftrw $12, %k0, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r12d ; KNL-NEXT: kshiftrw $13, %k0, %k1 -; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftrw $14, %k0, %k1 -; KNL-NEXT: andl $1, %r8d -; KNL-NEXT: movb %r8b, 2(%rax) -; KNL-NEXT: kmovw %k0, %r8d -; KNL-NEXT: andl $1, %r8d -; KNL-NEXT: andl $1, %r9d -; KNL-NEXT: leal (%r8,%r9,2), %r8d -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: andl $1, %ecx +; KNL-NEXT: movb %cl, 2(%rax) +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: andl $1, %ecx +; KNL-NEXT: andl $1, %edx +; KNL-NEXT: leal (%rcx,%rdx,2), %ecx +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: andl $1, %esi +; KNL-NEXT: leal (%rcx,%rsi,4), %ecx +; KNL-NEXT: kmovw %k0, %esi +; KNL-NEXT: andl $1, %edi +; KNL-NEXT: leal (%rcx,%rdi,8), %ecx +; KNL-NEXT: andl $1, %r9d +; KNL-NEXT: shll $4, %r9d +; KNL-NEXT: orl %ecx, %r9d +; KNL-NEXT: andl $1, %r8d +; KNL-NEXT: shll $5, %r8d +; KNL-NEXT: orl %r9d, %r8d ; KNL-NEXT: andl $1, %r10d -; KNL-NEXT: leal (%r8,%r10,4), %r8d -; KNL-NEXT: kmovw %k0, %r10d +; KNL-NEXT: shll $6, %r10d ; KNL-NEXT: andl $1, %r11d -; KNL-NEXT: leal (%r8,%r11,8), %r8d -; KNL-NEXT: andl $1, %r12d -; KNL-NEXT: shll $4, %r12d -; KNL-NEXT: orl %r8d, %r12d -; KNL-NEXT: andl $1, %r15d -; KNL-NEXT: shll $5, %r15d -; KNL-NEXT: orl %r12d, %r15d -; KNL-NEXT: andl $1, %r14d -; KNL-NEXT: shll $6, %r14d -; KNL-NEXT: andl $1, %r13d -; KNL-NEXT: shll $7, %r13d -; KNL-NEXT: orl %r14d, %r13d +; KNL-NEXT: shll $7, %r11d +; KNL-NEXT: orl %r10d, %r11d ; KNL-NEXT: andl $1, %ebx ; KNL-NEXT: shll $8, %ebx -; KNL-NEXT: orl %r13d, %ebx -; KNL-NEXT: andl $1, %esi -; KNL-NEXT: shll $9, %esi -; KNL-NEXT: orl %ebx, %esi +; KNL-NEXT: orl %r11d, %ebx +; KNL-NEXT: andl $1, %r14d +; KNL-NEXT: shll $9, %r14d +; KNL-NEXT: orl %ebx, %r14d ; KNL-NEXT: andl $1, %ebp ; KNL-NEXT: shll $10, %ebp -; KNL-NEXT: orl %esi, %ebp -; KNL-NEXT: orl %r15d, %ebp -; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: shll $11, %ecx +; KNL-NEXT: orl %r14d, %ebp +; KNL-NEXT: orl %r8d, %ebp +; KNL-NEXT: andl $1, %r15d +; KNL-NEXT: shll $11, %r15d +; KNL-NEXT: andl $1, %r12d +; KNL-NEXT: shll $12, %r12d +; KNL-NEXT: orl %r15d, %r12d +; KNL-NEXT: andl $1, %r13d +; KNL-NEXT: shll $13, %r13d +; KNL-NEXT: orl %r12d, %r13d ; KNL-NEXT: andl $1, %edx -; KNL-NEXT: shll $12, %edx -; KNL-NEXT: orl %ecx, %edx -; KNL-NEXT: andl $1, %edi -; KNL-NEXT: shll $13, %edi -; KNL-NEXT: orl %edx, %edi -; KNL-NEXT: andl $1, %r9d -; KNL-NEXT: shll $14, %r9d -; KNL-NEXT: orl %edi, %r9d -; KNL-NEXT: andl $1, %r10d -; KNL-NEXT: shll $15, %r10d -; KNL-NEXT: orl %r9d, %r10d -; KNL-NEXT: orl %ebp, %r10d -; KNL-NEXT: movw %r10w, (%rax) +; KNL-NEXT: shll $14, %edx +; KNL-NEXT: orl %r13d, %edx +; KNL-NEXT: andl $1, %esi +; KNL-NEXT: shll $15, %esi +; KNL-NEXT: orl %edx, %esi +; KNL-NEXT: orl %ebp, %esi +; KNL-NEXT: movw %si, (%rax) ; KNL-NEXT: popq %rbx ; KNL-NEXT: popq %r12 ; KNL-NEXT: popq %r13 @@ -1223,84 +1223,84 @@ ; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload ; SKX-NEXT: kandd %k1, %k0, %k0 ; SKX-NEXT: kshiftrd $16, 
%k0, %k1 -; SKX-NEXT: kmovd %k1, %r8d +; SKX-NEXT: kmovd %k1, %ecx ; SKX-NEXT: kshiftrd $1, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r9d +; SKX-NEXT: kmovd %k1, %edx ; SKX-NEXT: kshiftrd $2, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r10d +; SKX-NEXT: kmovd %k1, %esi ; SKX-NEXT: kshiftrd $3, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r11d +; SKX-NEXT: kmovd %k1, %edi ; SKX-NEXT: kshiftrd $4, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r12d +; SKX-NEXT: kmovd %k1, %r9d ; SKX-NEXT: kshiftrd $5, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r15d +; SKX-NEXT: kmovd %k1, %r8d ; SKX-NEXT: kshiftrd $6, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r14d +; SKX-NEXT: kmovd %k1, %r10d ; SKX-NEXT: kshiftrd $7, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r13d +; SKX-NEXT: kmovd %k1, %r11d ; SKX-NEXT: kshiftrd $8, %k0, %k1 ; SKX-NEXT: kmovd %k1, %ebx ; SKX-NEXT: kshiftrd $9, %k0, %k1 -; SKX-NEXT: kmovd %k1, %esi +; SKX-NEXT: kmovd %k1, %r14d ; SKX-NEXT: kshiftrd $10, %k0, %k1 ; SKX-NEXT: kmovd %k1, %ebp ; SKX-NEXT: kshiftrd $11, %k0, %k1 -; SKX-NEXT: kmovd %k1, %ecx +; SKX-NEXT: kmovd %k1, %r15d ; SKX-NEXT: kshiftrd $12, %k0, %k1 -; SKX-NEXT: kmovd %k1, %edx +; SKX-NEXT: kmovd %k1, %r12d ; SKX-NEXT: kshiftrd $13, %k0, %k1 -; SKX-NEXT: kmovd %k1, %edi +; SKX-NEXT: kmovd %k1, %r13d ; SKX-NEXT: kshiftrd $14, %k0, %k1 -; SKX-NEXT: andl $1, %r8d -; SKX-NEXT: movb %r8b, 2(%rax) -; SKX-NEXT: kmovd %k0, %r8d -; SKX-NEXT: andl $1, %r8d -; SKX-NEXT: andl $1, %r9d -; SKX-NEXT: leal (%r8,%r9,2), %r8d -; SKX-NEXT: kmovd %k1, %r9d +; SKX-NEXT: andl $1, %ecx +; SKX-NEXT: movb %cl, 2(%rax) +; SKX-NEXT: kmovd %k0, %ecx +; SKX-NEXT: andl $1, %ecx +; SKX-NEXT: andl $1, %edx +; SKX-NEXT: leal (%rcx,%rdx,2), %ecx +; SKX-NEXT: kmovd %k1, %edx ; SKX-NEXT: kshiftrd $15, %k0, %k0 +; SKX-NEXT: andl $1, %esi +; SKX-NEXT: leal (%rcx,%rsi,4), %ecx +; SKX-NEXT: kmovd %k0, %esi +; SKX-NEXT: andl $1, %edi +; SKX-NEXT: leal (%rcx,%rdi,8), %ecx +; SKX-NEXT: andl $1, %r9d +; SKX-NEXT: shll $4, %r9d +; SKX-NEXT: orl %ecx, %r9d +; SKX-NEXT: andl $1, %r8d +; SKX-NEXT: shll $5, %r8d +; SKX-NEXT: orl %r9d, %r8d ; SKX-NEXT: andl $1, %r10d -; SKX-NEXT: leal (%r8,%r10,4), %r8d -; SKX-NEXT: kmovd %k0, %r10d +; SKX-NEXT: shll $6, %r10d ; SKX-NEXT: andl $1, %r11d -; SKX-NEXT: leal (%r8,%r11,8), %r8d -; SKX-NEXT: andl $1, %r12d -; SKX-NEXT: shll $4, %r12d -; SKX-NEXT: orl %r8d, %r12d -; SKX-NEXT: andl $1, %r15d -; SKX-NEXT: shll $5, %r15d -; SKX-NEXT: orl %r12d, %r15d -; SKX-NEXT: andl $1, %r14d -; SKX-NEXT: shll $6, %r14d -; SKX-NEXT: andl $1, %r13d -; SKX-NEXT: shll $7, %r13d -; SKX-NEXT: orl %r14d, %r13d +; SKX-NEXT: shll $7, %r11d +; SKX-NEXT: orl %r10d, %r11d ; SKX-NEXT: andl $1, %ebx ; SKX-NEXT: shll $8, %ebx -; SKX-NEXT: orl %r13d, %ebx -; SKX-NEXT: andl $1, %esi -; SKX-NEXT: shll $9, %esi -; SKX-NEXT: orl %ebx, %esi +; SKX-NEXT: orl %r11d, %ebx +; SKX-NEXT: andl $1, %r14d +; SKX-NEXT: shll $9, %r14d +; SKX-NEXT: orl %ebx, %r14d ; SKX-NEXT: andl $1, %ebp ; SKX-NEXT: shll $10, %ebp -; SKX-NEXT: orl %esi, %ebp -; SKX-NEXT: orl %r15d, %ebp -; SKX-NEXT: andl $1, %ecx -; SKX-NEXT: shll $11, %ecx +; SKX-NEXT: orl %r14d, %ebp +; SKX-NEXT: orl %r8d, %ebp +; SKX-NEXT: andl $1, %r15d +; SKX-NEXT: shll $11, %r15d +; SKX-NEXT: andl $1, %r12d +; SKX-NEXT: shll $12, %r12d +; SKX-NEXT: orl %r15d, %r12d +; SKX-NEXT: andl $1, %r13d +; SKX-NEXT: shll $13, %r13d +; SKX-NEXT: orl %r12d, %r13d ; SKX-NEXT: andl $1, %edx -; SKX-NEXT: shll $12, %edx -; SKX-NEXT: orl %ecx, %edx -; SKX-NEXT: andl $1, %edi -; SKX-NEXT: shll $13, %edi -; SKX-NEXT: orl %edx, %edi -; SKX-NEXT: andl $1, %r9d -; SKX-NEXT: shll $14, %r9d -; SKX-NEXT: orl 
%edi, %r9d -; SKX-NEXT: andl $1, %r10d -; SKX-NEXT: shll $15, %r10d -; SKX-NEXT: orl %r9d, %r10d -; SKX-NEXT: orl %ebp, %r10d -; SKX-NEXT: movw %r10w, (%rax) +; SKX-NEXT: shll $14, %edx +; SKX-NEXT: orl %r13d, %edx +; SKX-NEXT: andl $1, %esi +; SKX-NEXT: shll $15, %esi +; SKX-NEXT: orl %edx, %esi +; SKX-NEXT: orl %ebp, %esi +; SKX-NEXT: movw %si, (%rax) ; SKX-NEXT: popq %rbx ; SKX-NEXT: popq %r12 ; SKX-NEXT: popq %r13 @@ -1864,84 +1864,84 @@ ; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload ; FASTISEL-NEXT: kandd %k1, %k0, %k0 ; FASTISEL-NEXT: kshiftrd $16, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r8d +; FASTISEL-NEXT: kmovd %k1, %ecx ; FASTISEL-NEXT: kshiftrd $1, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r9d +; FASTISEL-NEXT: kmovd %k1, %edx ; FASTISEL-NEXT: kshiftrd $2, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r10d +; FASTISEL-NEXT: kmovd %k1, %esi ; FASTISEL-NEXT: kshiftrd $3, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r11d +; FASTISEL-NEXT: kmovd %k1, %edi ; FASTISEL-NEXT: kshiftrd $4, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r12d +; FASTISEL-NEXT: kmovd %k1, %r9d ; FASTISEL-NEXT: kshiftrd $5, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r15d +; FASTISEL-NEXT: kmovd %k1, %r8d ; FASTISEL-NEXT: kshiftrd $6, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r14d +; FASTISEL-NEXT: kmovd %k1, %r10d ; FASTISEL-NEXT: kshiftrd $7, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r13d +; FASTISEL-NEXT: kmovd %k1, %r11d ; FASTISEL-NEXT: kshiftrd $8, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %ebx ; FASTISEL-NEXT: kshiftrd $9, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %esi +; FASTISEL-NEXT: kmovd %k1, %r14d ; FASTISEL-NEXT: kshiftrd $10, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %ebp ; FASTISEL-NEXT: kshiftrd $11, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %ecx +; FASTISEL-NEXT: kmovd %k1, %r15d ; FASTISEL-NEXT: kshiftrd $12, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %edx +; FASTISEL-NEXT: kmovd %k1, %r12d ; FASTISEL-NEXT: kshiftrd $13, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %edi +; FASTISEL-NEXT: kmovd %k1, %r13d ; FASTISEL-NEXT: kshiftrd $14, %k0, %k1 -; FASTISEL-NEXT: andl $1, %r8d -; FASTISEL-NEXT: movb %r8b, 2(%rax) -; FASTISEL-NEXT: kmovd %k0, %r8d -; FASTISEL-NEXT: andl $1, %r8d -; FASTISEL-NEXT: andl $1, %r9d -; FASTISEL-NEXT: leal (%r8,%r9,2), %r8d -; FASTISEL-NEXT: kmovd %k1, %r9d +; FASTISEL-NEXT: andl $1, %ecx +; FASTISEL-NEXT: movb %cl, 2(%rax) +; FASTISEL-NEXT: kmovd %k0, %ecx +; FASTISEL-NEXT: andl $1, %ecx +; FASTISEL-NEXT: andl $1, %edx +; FASTISEL-NEXT: leal (%rcx,%rdx,2), %ecx +; FASTISEL-NEXT: kmovd %k1, %edx ; FASTISEL-NEXT: kshiftrd $15, %k0, %k0 +; FASTISEL-NEXT: andl $1, %esi +; FASTISEL-NEXT: leal (%rcx,%rsi,4), %ecx +; FASTISEL-NEXT: kmovd %k0, %esi +; FASTISEL-NEXT: andl $1, %edi +; FASTISEL-NEXT: leal (%rcx,%rdi,8), %ecx +; FASTISEL-NEXT: andl $1, %r9d +; FASTISEL-NEXT: shll $4, %r9d +; FASTISEL-NEXT: orl %ecx, %r9d +; FASTISEL-NEXT: andl $1, %r8d +; FASTISEL-NEXT: shll $5, %r8d +; FASTISEL-NEXT: orl %r9d, %r8d ; FASTISEL-NEXT: andl $1, %r10d -; FASTISEL-NEXT: leal (%r8,%r10,4), %r8d -; FASTISEL-NEXT: kmovd %k0, %r10d +; FASTISEL-NEXT: shll $6, %r10d ; FASTISEL-NEXT: andl $1, %r11d -; FASTISEL-NEXT: leal (%r8,%r11,8), %r8d -; FASTISEL-NEXT: andl $1, %r12d -; FASTISEL-NEXT: shll $4, %r12d -; FASTISEL-NEXT: orl %r8d, %r12d -; FASTISEL-NEXT: andl $1, %r15d -; FASTISEL-NEXT: shll $5, %r15d -; FASTISEL-NEXT: orl %r12d, %r15d -; FASTISEL-NEXT: andl $1, %r14d -; FASTISEL-NEXT: shll $6, %r14d -; FASTISEL-NEXT: andl $1, %r13d -; FASTISEL-NEXT: shll $7, %r13d -; FASTISEL-NEXT: orl %r14d, %r13d +; FASTISEL-NEXT: shll $7, 
%r11d +; FASTISEL-NEXT: orl %r10d, %r11d ; FASTISEL-NEXT: andl $1, %ebx ; FASTISEL-NEXT: shll $8, %ebx -; FASTISEL-NEXT: orl %r13d, %ebx -; FASTISEL-NEXT: andl $1, %esi -; FASTISEL-NEXT: shll $9, %esi -; FASTISEL-NEXT: orl %ebx, %esi +; FASTISEL-NEXT: orl %r11d, %ebx +; FASTISEL-NEXT: andl $1, %r14d +; FASTISEL-NEXT: shll $9, %r14d +; FASTISEL-NEXT: orl %ebx, %r14d ; FASTISEL-NEXT: andl $1, %ebp ; FASTISEL-NEXT: shll $10, %ebp -; FASTISEL-NEXT: orl %esi, %ebp -; FASTISEL-NEXT: orl %r15d, %ebp -; FASTISEL-NEXT: andl $1, %ecx -; FASTISEL-NEXT: shll $11, %ecx +; FASTISEL-NEXT: orl %r14d, %ebp +; FASTISEL-NEXT: orl %r8d, %ebp +; FASTISEL-NEXT: andl $1, %r15d +; FASTISEL-NEXT: shll $11, %r15d +; FASTISEL-NEXT: andl $1, %r12d +; FASTISEL-NEXT: shll $12, %r12d +; FASTISEL-NEXT: orl %r15d, %r12d +; FASTISEL-NEXT: andl $1, %r13d +; FASTISEL-NEXT: shll $13, %r13d +; FASTISEL-NEXT: orl %r12d, %r13d ; FASTISEL-NEXT: andl $1, %edx -; FASTISEL-NEXT: shll $12, %edx -; FASTISEL-NEXT: orl %ecx, %edx -; FASTISEL-NEXT: andl $1, %edi -; FASTISEL-NEXT: shll $13, %edi -; FASTISEL-NEXT: orl %edx, %edi -; FASTISEL-NEXT: andl $1, %r9d -; FASTISEL-NEXT: shll $14, %r9d -; FASTISEL-NEXT: orl %edi, %r9d -; FASTISEL-NEXT: andl $1, %r10d -; FASTISEL-NEXT: shll $15, %r10d -; FASTISEL-NEXT: orl %r9d, %r10d -; FASTISEL-NEXT: orl %ebp, %r10d -; FASTISEL-NEXT: movw %r10w, (%rax) +; FASTISEL-NEXT: shll $14, %edx +; FASTISEL-NEXT: orl %r13d, %edx +; FASTISEL-NEXT: andl $1, %esi +; FASTISEL-NEXT: shll $15, %esi +; FASTISEL-NEXT: orl %edx, %esi +; FASTISEL-NEXT: orl %ebp, %esi +; FASTISEL-NEXT: movw %si, (%rax) ; FASTISEL-NEXT: popq %rbx ; FASTISEL-NEXT: popq %r12 ; FASTISEL-NEXT: popq %r13 @@ -2045,13 +2045,13 @@ ; KNL-NEXT: kshiftrw $9, %k6, %k6 ; KNL-NEXT: korw %k6, %k0, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; KNL-NEXT: andl $1, %r10d ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: andl $1, %edi +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; KNL-NEXT: kmovw %r10d, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kmovw %r10d, %k6 +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi @@ -2084,13 +2084,13 @@ ; KNL-NEXT: kshiftrw $9, %k6, %k6 ; KNL-NEXT: korw %k6, %k0, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; KNL-NEXT: andl $1, %r10d ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: andl $1, %edi +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; KNL-NEXT: kmovw %r10d, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kmovw %r10d, %k6 +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi @@ -2123,13 +2123,13 @@ ; KNL-NEXT: kshiftrw $9, %k6, %k6 ; KNL-NEXT: korw %k6, %k0, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; KNL-NEXT: andl $1, %r10d ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: andl $1, %edi +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; KNL-NEXT: kmovw %r10d, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kmovw %r10d, %k6 +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl 
{{[0-9]+}}(%rsp), %edi @@ -2162,13 +2162,13 @@ ; KNL-NEXT: kshiftrw $9, %k6, %k6 ; KNL-NEXT: korw %k6, %k0, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; KNL-NEXT: andl $1, %r10d ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: andl $1, %edi +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; KNL-NEXT: kmovw %r10d, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kmovw %r10d, %k6 +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi @@ -2201,13 +2201,13 @@ ; KNL-NEXT: kshiftrw $9, %k6, %k6 ; KNL-NEXT: korw %k6, %k0, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; KNL-NEXT: andl $1, %r10d ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: andl $1, %edi +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; KNL-NEXT: kmovw %r10d, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kmovw %r10d, %k6 +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi @@ -2327,38 +2327,38 @@ ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftrw $5, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftrw $4, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftrw $3, %k0, %k1 ; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftrw $1, %k0, %k1 -; KNL-NEXT: kmovw %k1, %edx -; KNL-NEXT: kmovw %k0, %esi -; KNL-NEXT: andb $1, %sil -; KNL-NEXT: andb $1, %dl -; KNL-NEXT: addb %dl, %dl -; KNL-NEXT: orb %sil, %dl -; KNL-NEXT: andb $1, %cl -; KNL-NEXT: shlb $2, %cl -; KNL-NEXT: orb %dl, %cl -; KNL-NEXT: andb $1, %dil -; KNL-NEXT: shlb $3, %dil -; KNL-NEXT: orb %cl, %dil +; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kmovw %k0, %r10d ; KNL-NEXT: andb $1, %r10b -; KNL-NEXT: shlb $4, %r10b -; KNL-NEXT: orb %dil, %r10b ; KNL-NEXT: andb $1, %r9b -; KNL-NEXT: shlb $5, %r9b +; KNL-NEXT: addb %r9b, %r9b ; KNL-NEXT: orb %r10b, %r9b -; KNL-NEXT: shlb $6, %r8b +; KNL-NEXT: andb $1, %r8b +; KNL-NEXT: shlb $2, %r8b ; KNL-NEXT: orb %r9b, %r8b -; KNL-NEXT: andb $127, %r8b -; KNL-NEXT: movb %r8b, (%rax) +; KNL-NEXT: andb $1, %dil +; KNL-NEXT: shlb $3, %dil +; KNL-NEXT: orb %r8b, %dil +; KNL-NEXT: andb $1, %sil +; KNL-NEXT: shlb $4, %sil +; KNL-NEXT: orb %dil, %sil +; KNL-NEXT: andb $1, %dl +; KNL-NEXT: shlb $5, %dl +; KNL-NEXT: orb %sil, %dl +; KNL-NEXT: shlb $6, %cl +; KNL-NEXT: orb %dl, %cl +; KNL-NEXT: andb $127, %cl +; KNL-NEXT: movb %cl, (%rax) ; KNL-NEXT: retq ; ; SKX-LABEL: test17: @@ -2705,38 +2705,38 @@ ; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload ; SKX-NEXT: kandb %k1, %k0, %k0 ; SKX-NEXT: kshiftrb $6, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r8d +; SKX-NEXT: kmovd %k1, %ecx ; SKX-NEXT: kshiftrb $5, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r9d +; SKX-NEXT: kmovd %k1, %edx ; SKX-NEXT: kshiftrb $4, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r10d +; SKX-NEXT: kmovd %k1, %esi ; SKX-NEXT: kshiftrb $3, %k0, %k1 ; SKX-NEXT: kmovd %k1, %edi ; SKX-NEXT: kshiftrb $2, %k0, %k1 -; SKX-NEXT: kmovd %k1, %ecx +; SKX-NEXT: kmovd 
%k1, %r8d ; SKX-NEXT: kshiftrb $1, %k0, %k1 -; SKX-NEXT: kmovd %k1, %edx -; SKX-NEXT: kmovd %k0, %esi -; SKX-NEXT: andb $1, %sil -; SKX-NEXT: andb $1, %dl -; SKX-NEXT: addb %dl, %dl -; SKX-NEXT: orb %sil, %dl -; SKX-NEXT: andb $1, %cl -; SKX-NEXT: shlb $2, %cl -; SKX-NEXT: orb %dl, %cl -; SKX-NEXT: andb $1, %dil -; SKX-NEXT: shlb $3, %dil -; SKX-NEXT: orb %cl, %dil +; SKX-NEXT: kmovd %k1, %r9d +; SKX-NEXT: kmovd %k0, %r10d ; SKX-NEXT: andb $1, %r10b -; SKX-NEXT: shlb $4, %r10b -; SKX-NEXT: orb %dil, %r10b ; SKX-NEXT: andb $1, %r9b -; SKX-NEXT: shlb $5, %r9b +; SKX-NEXT: addb %r9b, %r9b ; SKX-NEXT: orb %r10b, %r9b -; SKX-NEXT: shlb $6, %r8b +; SKX-NEXT: andb $1, %r8b +; SKX-NEXT: shlb $2, %r8b ; SKX-NEXT: orb %r9b, %r8b -; SKX-NEXT: andb $127, %r8b -; SKX-NEXT: movb %r8b, (%rax) +; SKX-NEXT: andb $1, %dil +; SKX-NEXT: shlb $3, %dil +; SKX-NEXT: orb %r8b, %dil +; SKX-NEXT: andb $1, %sil +; SKX-NEXT: shlb $4, %sil +; SKX-NEXT: orb %dil, %sil +; SKX-NEXT: andb $1, %dl +; SKX-NEXT: shlb $5, %dl +; SKX-NEXT: orb %sil, %dl +; SKX-NEXT: shlb $6, %cl +; SKX-NEXT: orb %dl, %cl +; SKX-NEXT: andb $127, %cl +; SKX-NEXT: movb %cl, (%rax) ; SKX-NEXT: retq ; ; KNL_X32-LABEL: test17: @@ -3494,38 +3494,38 @@ ; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload ; FASTISEL-NEXT: kandb %k1, %k0, %k0 ; FASTISEL-NEXT: kshiftrb $6, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r8d +; FASTISEL-NEXT: kmovd %k1, %ecx ; FASTISEL-NEXT: kshiftrb $5, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r9d +; FASTISEL-NEXT: kmovd %k1, %edx ; FASTISEL-NEXT: kshiftrb $4, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r10d +; FASTISEL-NEXT: kmovd %k1, %esi ; FASTISEL-NEXT: kshiftrb $3, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %edi ; FASTISEL-NEXT: kshiftrb $2, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %ecx +; FASTISEL-NEXT: kmovd %k1, %r8d ; FASTISEL-NEXT: kshiftrb $1, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %edx -; FASTISEL-NEXT: kmovd %k0, %esi -; FASTISEL-NEXT: andb $1, %sil -; FASTISEL-NEXT: andb $1, %dl -; FASTISEL-NEXT: addb %dl, %dl -; FASTISEL-NEXT: orb %sil, %dl -; FASTISEL-NEXT: andb $1, %cl -; FASTISEL-NEXT: shlb $2, %cl -; FASTISEL-NEXT: orb %dl, %cl -; FASTISEL-NEXT: andb $1, %dil -; FASTISEL-NEXT: shlb $3, %dil -; FASTISEL-NEXT: orb %cl, %dil +; FASTISEL-NEXT: kmovd %k1, %r9d +; FASTISEL-NEXT: kmovd %k0, %r10d ; FASTISEL-NEXT: andb $1, %r10b -; FASTISEL-NEXT: shlb $4, %r10b -; FASTISEL-NEXT: orb %dil, %r10b ; FASTISEL-NEXT: andb $1, %r9b -; FASTISEL-NEXT: shlb $5, %r9b +; FASTISEL-NEXT: addb %r9b, %r9b ; FASTISEL-NEXT: orb %r10b, %r9b -; FASTISEL-NEXT: shlb $6, %r8b +; FASTISEL-NEXT: andb $1, %r8b +; FASTISEL-NEXT: shlb $2, %r8b ; FASTISEL-NEXT: orb %r9b, %r8b -; FASTISEL-NEXT: andb $127, %r8b -; FASTISEL-NEXT: movb %r8b, (%rax) +; FASTISEL-NEXT: andb $1, %dil +; FASTISEL-NEXT: shlb $3, %dil +; FASTISEL-NEXT: orb %r8b, %dil +; FASTISEL-NEXT: andb $1, %sil +; FASTISEL-NEXT: shlb $4, %sil +; FASTISEL-NEXT: orb %dil, %sil +; FASTISEL-NEXT: andb $1, %dl +; FASTISEL-NEXT: shlb $5, %dl +; FASTISEL-NEXT: orb %sil, %dl +; FASTISEL-NEXT: shlb $6, %cl +; FASTISEL-NEXT: orb %dl, %cl +; FASTISEL-NEXT: andb $127, %cl +; FASTISEL-NEXT: movb %cl, (%rax) ; FASTISEL-NEXT: retq %j = and <7 x i1> %a, %b %k = and <7 x i1> %j, %c diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -986,11 +986,8 @@ ; ; WIN64-LABEL: testi32_inp: ; WIN64: # %bb.0: -; WIN64-NEXT: pushq %r13 -; WIN64-NEXT: 
pushq %rbp ; WIN64-NEXT: pushq %rbx ; WIN64-NEXT: # kill: def $edx killed $edx def $rdx -; WIN64-NEXT: movl %ecx, %ebx ; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15 ; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14 @@ -1000,45 +997,40 @@ ; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9 ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi -; WIN64-NEXT: leal (%rdx,%rdi), %r13d +; WIN64-NEXT: leal (%rdx,%rdi), %ebx ; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx ; WIN64-NEXT: subl %edi, %edx -; WIN64-NEXT: leal (%rsi,%r8), %ecx +; WIN64-NEXT: leal (%rsi,%r8), %edi ; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi ; WIN64-NEXT: subl %r8d, %esi ; WIN64-NEXT: leal (%r9,%r10), %r8d -; WIN64-NEXT: movl %r9d, %ebp -; WIN64-NEXT: subl %r10d, %ebp -; WIN64-NEXT: movl %eax, %edi -; WIN64-NEXT: movl %ebx, %r9d -; WIN64-NEXT: subl %ebx, %edi -; WIN64-NEXT: imull %edi, %ebp -; WIN64-NEXT: leal (%r11,%r12), %edi -; WIN64-NEXT: movl %r11d, %ebx -; WIN64-NEXT: subl %r12d, %ebx -; WIN64-NEXT: imull %edx, %ebx -; WIN64-NEXT: addl %ebp, %ebx +; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9 +; WIN64-NEXT: subl %r10d, %r9d +; WIN64-NEXT: movl %eax, %r10d +; WIN64-NEXT: subl %ecx, %r10d +; WIN64-NEXT: imull %r10d, %r9d +; WIN64-NEXT: leal (%r11,%r12), %r10d +; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11 +; WIN64-NEXT: subl %r12d, %r11d +; WIN64-NEXT: imull %edx, %r11d +; WIN64-NEXT: addl %r9d, %r11d ; WIN64-NEXT: leal (%r14,%r15), %edx -; WIN64-NEXT: movl %r14d, %ebp -; WIN64-NEXT: subl %r15d, %ebp -; WIN64-NEXT: imull %esi, %ebp -; WIN64-NEXT: addl %ebx, %ebp -; WIN64-NEXT: addl %r9d, %eax +; WIN64-NEXT: movl %r14d, %r9d +; WIN64-NEXT: subl %r15d, %r9d +; WIN64-NEXT: imull %esi, %r9d +; WIN64-NEXT: addl %r11d, %r9d +; WIN64-NEXT: addl %ecx, %eax ; WIN64-NEXT: imull %r8d, %eax -; WIN64-NEXT: imull %r13d, %edi -; WIN64-NEXT: addl %edi, %eax -; WIN64-NEXT: imull %ecx, %edx +; WIN64-NEXT: imull %ebx, %r10d +; WIN64-NEXT: addl %r10d, %eax +; WIN64-NEXT: imull %edi, %edx ; WIN64-NEXT: addl %edx, %eax -; WIN64-NEXT: addl %ebp, %eax +; WIN64-NEXT: addl %r9d, %eax ; WIN64-NEXT: popq %rbx -; WIN64-NEXT: popq %rbp -; WIN64-NEXT: popq %r13 ; WIN64-NEXT: retq ; ; LINUXOSX64-LABEL: testi32_inp: ; LINUXOSX64: # %bb.0: -; LINUXOSX64-NEXT: pushq %rbp -; LINUXOSX64-NEXT: pushq %rbx ; LINUXOSX64-NEXT: # kill: def $edx killed $edx def $rdx ; LINUXOSX64-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX64-NEXT: # kill: def $r14d killed $r14d def $r14 @@ -1048,37 +1040,35 @@ ; LINUXOSX64-NEXT: # kill: def $r8d killed $r8d def $r8 ; LINUXOSX64-NEXT: # kill: def $edi killed $edi def $rdi ; LINUXOSX64-NEXT: leal (%rdx,%rdi), %r10d -; LINUXOSX64-NEXT: movl %edx, %ebp -; LINUXOSX64-NEXT: subl %edi, %ebp -; LINUXOSX64-NEXT: leal (%rsi,%r8), %r11d +; LINUXOSX64-NEXT: # kill: def $edx killed $edx killed $rdx +; LINUXOSX64-NEXT: subl %edi, %edx +; LINUXOSX64-NEXT: leal (%rsi,%r8), %edi ; LINUXOSX64-NEXT: # kill: def $esi killed $esi killed $rsi ; LINUXOSX64-NEXT: subl %r8d, %esi ; LINUXOSX64-NEXT: leal (%r9,%r12), %r8d -; LINUXOSX64-NEXT: movl %r9d, %edi -; LINUXOSX64-NEXT: subl %r12d, %edi -; LINUXOSX64-NEXT: movl %eax, %edx -; LINUXOSX64-NEXT: subl %ecx, %edx -; LINUXOSX64-NEXT: imull %edx, %edi -; LINUXOSX64-NEXT: leal (%r13,%r14), %edx -; LINUXOSX64-NEXT: movl %r13d, %ebx -; LINUXOSX64-NEXT: subl %r14d, %ebx -; LINUXOSX64-NEXT: imull %ebp, %ebx -; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %ebp 
-; LINUXOSX64-NEXT: addl %edi, %ebx -; LINUXOSX64-NEXT: movl %r15d, %edi -; LINUXOSX64-NEXT: subl %ebp, %edi -; LINUXOSX64-NEXT: imull %esi, %edi -; LINUXOSX64-NEXT: addl %ebx, %edi +; LINUXOSX64-NEXT: # kill: def $r9d killed $r9d killed $r9 +; LINUXOSX64-NEXT: subl %r12d, %r9d +; LINUXOSX64-NEXT: movl %eax, %r11d +; LINUXOSX64-NEXT: subl %ecx, %r11d +; LINUXOSX64-NEXT: imull %r11d, %r9d +; LINUXOSX64-NEXT: leal (%r13,%r14), %r11d +; LINUXOSX64-NEXT: movl %r13d, %r12d +; LINUXOSX64-NEXT: subl %r14d, %r12d +; LINUXOSX64-NEXT: imull %edx, %r12d +; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %edx +; LINUXOSX64-NEXT: addl %r9d, %r12d +; LINUXOSX64-NEXT: movl %r15d, %r9d +; LINUXOSX64-NEXT: subl %edx, %r9d +; LINUXOSX64-NEXT: imull %esi, %r9d +; LINUXOSX64-NEXT: addl %r12d, %r9d ; LINUXOSX64-NEXT: addl %ecx, %eax ; LINUXOSX64-NEXT: imull %r8d, %eax -; LINUXOSX64-NEXT: imull %r10d, %edx +; LINUXOSX64-NEXT: imull %r10d, %r11d +; LINUXOSX64-NEXT: addl %r11d, %eax +; LINUXOSX64-NEXT: addl %r15d, %edx +; LINUXOSX64-NEXT: imull %edi, %edx ; LINUXOSX64-NEXT: addl %edx, %eax -; LINUXOSX64-NEXT: addl %r15d, %ebp -; LINUXOSX64-NEXT: imull %r11d, %ebp -; LINUXOSX64-NEXT: addl %ebp, %eax -; LINUXOSX64-NEXT: addl %edi, %eax -; LINUXOSX64-NEXT: popq %rbx -; LINUXOSX64-NEXT: popq %rbp +; LINUXOSX64-NEXT: addl %r9d, %eax ; LINUXOSX64-NEXT: retq %x1 = sub i32 %a1, %a2 %x2 = sub i32 %a3, %a4 diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -4846,7 +4846,7 @@ ; X64-LABEL: test_cmp_b_256: ; X64: # %bb.0: ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] -; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02] @@ -4856,15 +4856,15 @@ ; X64-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x05] ; X64-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] ; X64-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] ; X64-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] ; X64-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01] -; X64-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x02] +; X64-NEXT: vpinsrd $2, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x02] ; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; X64-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; X64-NEXT: # xmm0 = xmm0[0,1,2],xmm1[3] ; X64-NEXT: vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9] -; X64-NEXT: vmovd %r8d, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0] +; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] ; X64-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9] ; X64-NEXT: # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X64-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX 
Compression encoding: [0xc5,0xf9,0x6e,0xd2] @@ -4946,26 +4946,26 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] -; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02] -; X64-NEXT: kmovd %k0, %r9d # encoding: [0xc5,0x7b,0x93,0xc8] +; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X64-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04] ; X64-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X64-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x05] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] ; X64-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1] -; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X64-NEXT: kmovd %k0, %r9d # encoding: [0xc5,0x7b,0x93,0xc8] ; X64-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] -; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01] -; X64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02] +; X64-NEXT: vpinsrd $1, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x01] +; X64-NEXT: vpinsrd $2, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc1,0x02] ; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03] ; X64-NEXT: vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9] -; X64-NEXT: vmovd %r8d, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0] +; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] ; X64-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9] ; X64-NEXT: # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-NEXT: vmovd %r9d, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd1] +; X64-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2] ; X64-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca] ; X64-NEXT: # xmm1 = xmm1[0],xmm2[0] ; X64-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] @@ -5040,7 +5040,7 @@ ; X64-LABEL: test_ucmp_b_256: ; X64: # %bb.0: ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] -; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02] @@ -5050,15 +5050,15 @@ ; X64-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05] ; X64-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] ; X64-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x06] -; X64-NEXT: kmovd %k0, %eax # encoding: 
[0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] ; X64-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] ; X64-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01] -; X64-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x02] +; X64-NEXT: vpinsrd $2, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x02] ; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; X64-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; X64-NEXT: # xmm0 = xmm0[0,1,2],xmm1[3] ; X64-NEXT: vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9] -; X64-NEXT: vmovd %r8d, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0] +; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] ; X64-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9] ; X64-NEXT: # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X64-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2] @@ -5140,26 +5140,26 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] -; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02] -; X64-NEXT: kmovd %k0, %r9d # encoding: [0xc5,0x7b,0x93,0xc8] +; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X64-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04] ; X64-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X64-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] ; X64-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06] -; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X64-NEXT: kmovd %k0, %r9d # encoding: [0xc5,0x7b,0x93,0xc8] ; X64-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] -; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01] -; X64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02] +; X64-NEXT: vpinsrd $1, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x01] +; X64-NEXT: vpinsrd $2, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc1,0x02] ; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03] ; X64-NEXT: vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9] -; X64-NEXT: vmovd %r8d, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0] +; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] ; X64-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9] ; X64-NEXT: # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-NEXT: vmovd %r9d, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd1] +; X64-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression 
encoding: [0xc5,0xf9,0x6e,0xd2] ; X64-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca] ; X64-NEXT: # xmm1 = xmm1[0],xmm2[0] ; X64-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -49,8 +49,8 @@ ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movq %rdx, %r14 -; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rsi, %r14 ; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq __truncdfbf2@PLT ; CHECK-NEXT: movd %xmm0, %ebp @@ -67,7 +67,7 @@ ; CHECK-NEXT: shll $16, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm0 -; CHECK-NEXT: movsd %xmm0, (%r14) +; CHECK-NEXT: movsd %xmm0, (%rbx) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %rbp @@ -207,63 +207,63 @@ ; CHECK-NEXT: shrq $48, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; CHECK-NEXT: movq %xmm0, %rbx -; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: movq %xmm0, %r12 +; CHECK-NEXT: movq %r12, %rax ; CHECK-NEXT: shrq $32, %rax ; CHECK-NEXT: movq %rax, (%rsp) # 8-byte Spill ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; CHECK-NEXT: movq %xmm0, %rbp -; CHECK-NEXT: movq %rbp, %r15 -; CHECK-NEXT: shrq $32, %r15 -; CHECK-NEXT: movq %rbx, %r13 +; CHECK-NEXT: movq %xmm0, %r14 +; CHECK-NEXT: movq %r14, %rbp +; CHECK-NEXT: shrq $32, %rbp +; CHECK-NEXT: movq %r12, %r15 +; CHECK-NEXT: shrq $48, %r15 +; CHECK-NEXT: movq %r14, %r13 ; CHECK-NEXT: shrq $48, %r13 -; CHECK-NEXT: movq %rbp, %r12 -; CHECK-NEXT: shrq $48, %r12 -; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: movl %r14d, %eax ; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: movl %r12d, %eax ; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT -; CHECK-NEXT: movd %xmm0, %r14d -; CHECK-NEXT: shll $16, %r14d -; CHECK-NEXT: shll $16, %ebp -; CHECK-NEXT: movd %ebp, %xmm1 +; CHECK-NEXT: movd %xmm0, %ebx ; CHECK-NEXT: shll $16, %ebx -; CHECK-NEXT: movd %ebx, %xmm0 +; CHECK-NEXT: shll $16, %r14d +; CHECK-NEXT: movd %r14d, %xmm1 +; CHECK-NEXT: shll $16, %r12d +; CHECK-NEXT: movd %r12d, %xmm0 ; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: movzwl %ax, %ebx -; CHECK-NEXT: orl %r14d, %ebx -; CHECK-NEXT: shll $16, %r12d -; CHECK-NEXT: movd %r12d, %xmm1 +; CHECK-NEXT: movzwl %ax, %r12d +; CHECK-NEXT: orl %ebx, %r12d ; CHECK-NEXT: shll $16, %r13d -; CHECK-NEXT: movd %r13d, %xmm0 +; CHECK-NEXT: movd %r13d, %xmm1 +; CHECK-NEXT: shll $16, %r15d +; CHECK-NEXT: movd %r15d, %xmm0 ; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT -; CHECK-NEXT: movd %xmm0, %ebp +; CHECK-NEXT: movd %xmm0, %r14d +; CHECK-NEXT: shll $16, %r14d ; CHECK-NEXT: shll $16, %ebp -; CHECK-NEXT: shll $16, %r15d -; CHECK-NEXT: movd %r15d, %xmm1 +; CHECK-NEXT: movd %ebp, %xmm1 ; CHECK-NEXT: movq (%rsp), %rax # 8-byte Reload ; CHECK-NEXT: shll $16, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: movzwl %ax, %r14d -; 
CHECK-NEXT: orl %ebp, %r14d -; CHECK-NEXT: shlq $32, %r14 -; CHECK-NEXT: orq %rbx, %r14 +; CHECK-NEXT: movzwl %ax, %ebx +; CHECK-NEXT: orl %r14d, %ebx +; CHECK-NEXT: shlq $32, %rbx +; CHECK-NEXT: orq %r12, %rbx ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; CHECK-NEXT: movl %r15d, %eax ; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; CHECK-NEXT: movl %r14d, %eax ; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: addss %xmm1, %xmm0 @@ -273,14 +273,14 @@ ; CHECK-NEXT: movq %r15, %rax ; CHECK-NEXT: shll $16, %eax ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: movq %r14, %rax ; CHECK-NEXT: shll $16, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: movzwl %ax, %ebx -; CHECK-NEXT: orl %ebp, %ebx +; CHECK-NEXT: movzwl %ax, %r14d +; CHECK-NEXT: orl %ebp, %r14d ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-NEXT: shll $16, %eax ; CHECK-NEXT: movd %eax, %xmm1 @@ -303,9 +303,9 @@ ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: orl %ebp, %eax ; CHECK-NEXT: shlq $32, %rax -; CHECK-NEXT: orq %rbx, %rax +; CHECK-NEXT: orq %r14, %rax ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movq %r14, %xmm1 +; CHECK-NEXT: movq %rbx, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: addq $56, %rsp ; CHECK-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -38,13 +38,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm9 ; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8 ; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm8 +; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -222,18 +222,18 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 ; AVX1-NEXT: vpcmpgtw %xmm8, %xmm9, %xmm8 ; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm8 +; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 +; AVX1-NEXT: vpcmpgtw %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2 -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpcmpgtw %xmm7, %xmm5, %xmm2 -; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: 
vextractf128 $1, %ymm7, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 +; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtw %xmm7, %xmm5, %xmm3 +; AVX1-NEXT: vpacksswb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 @@ -330,13 +330,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 ; AVX1-NEXT: vpcmpgtd %xmm8, %xmm9, %xmm8 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm8, %xmm1, %xmm8 +; AVX1-NEXT: vpackssdw %xmm8, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm8, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 @@ -508,28 +508,28 @@ ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 ; AVX1-NEXT: vpcmpgtb %xmm8, %xmm9, %xmm8 -; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 +; AVX1-NEXT: vpcmpgtb %xmm3, %xmm9, %xmm3 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 -; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm9 +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm9, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm7, %xmm5, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3 +; AVX1-NEXT: vpcmpgtb %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm7 ; AVX1-NEXT: vpcmpgtb %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtb %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: vpmovmskb %xmm1, %ecx +; AVX1-NEXT: vpmovmskb %xmm3, %ecx ; AVX1-NEXT: shll $16, %ecx ; AVX1-NEXT: orl %eax, %ecx -; AVX1-NEXT: vpmovmskb %xmm3, %edx +; AVX1-NEXT: vpmovmskb %xmm1, %edx ; AVX1-NEXT: vpmovmskb %xmm2, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %edx, %eax diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -1010,7 +1010,6 @@ ; ; X64-LABEL: large_promotion: ; X64: # %bb.0: -; X64-NEXT: pushq %rbp ; X64-NEXT: pushq %r15 ; X64-NEXT: pushq %r14 ; X64-NEXT: pushq %r13 @@ -1019,189 +1018,188 @@ ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; X64-NEXT: bswapq %rdi -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: shrq $4, %rbx -; X64-NEXT: movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %r13, %rbx -; X64-NEXT: andq %r13, %rdi +; X64-NEXT: movq %rdi, %r10 +; X64-NEXT: shrq 
$4, %r10 +; X64-NEXT: movabsq $1085102592571150095, %r11 # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %r11, %r10 +; X64-NEXT: andq %r11, %rdi ; X64-NEXT: shlq $4, %rdi -; X64-NEXT: orq %rbx, %rdi -; X64-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333 -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: andq %r11, %rbx +; X64-NEXT: orq %r10, %rdi +; X64-NEXT: movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333 +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: andq %r10, %r14 ; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %r11, %rdi -; X64-NEXT: leaq (%rdi,%rbx,4), %rdi -; X64-NEXT: movabsq $6148820866244280320, %r10 # imm = 0x5555000000000000 -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: andq %r10, %rbx -; X64-NEXT: shrq %rdi ; X64-NEXT: andq %r10, %rdi -; X64-NEXT: leaq (%rdi,%rbx,2), %r10 -; X64-NEXT: bswapq %rbp -; X64-NEXT: movq %rbp, %rdi -; X64-NEXT: shrq $4, %rdi -; X64-NEXT: andq %r13, %rdi -; X64-NEXT: andq %r13, %rbp -; X64-NEXT: shlq $4, %rbp -; X64-NEXT: orq %rdi, %rbp -; X64-NEXT: movq %rbp, %rdi -; X64-NEXT: andq %r11, %rdi -; X64-NEXT: shrq $2, %rbp -; X64-NEXT: andq %r11, %rbp -; X64-NEXT: leaq (%rbp,%rdi,4), %rdi -; X64-NEXT: movabsq $6148914691236517205, %rbp # imm = 0x5555555555555555 -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: andq %rbp, %rbx +; X64-NEXT: leaq (%rdi,%r14,4), %rdi +; X64-NEXT: movabsq $6148820866244280320, %r14 # imm = 0x5555000000000000 +; X64-NEXT: movq %rdi, %r13 +; X64-NEXT: andq %r14, %r13 ; X64-NEXT: shrq %rdi -; X64-NEXT: andq %rbp, %rdi -; X64-NEXT: leaq (%rdi,%rbx,2), %r14 -; X64-NEXT: shrdq $48, %r14, %r10 +; X64-NEXT: andq %r14, %rdi +; X64-NEXT: leaq (%rdi,%r13,2), %rdi +; X64-NEXT: bswapq %rbx +; X64-NEXT: movq %rbx, %r14 +; X64-NEXT: shrq $4, %r14 +; X64-NEXT: andq %r11, %r14 +; X64-NEXT: andq %r11, %rbx +; X64-NEXT: shlq $4, %rbx +; X64-NEXT: orq %r14, %rbx +; X64-NEXT: movq %rbx, %r14 +; X64-NEXT: andq %r10, %r14 +; X64-NEXT: shrq $2, %rbx +; X64-NEXT: andq %r10, %rbx +; X64-NEXT: leaq (%rbx,%r14,4), %rbx +; X64-NEXT: movabsq $6148914691236517205, %r14 # imm = 0x5555555555555555 +; X64-NEXT: movq %rbx, %r13 +; X64-NEXT: andq %r14, %r13 +; X64-NEXT: shrq %rbx +; X64-NEXT: andq %r14, %rbx +; X64-NEXT: leaq (%rbx,%r13,2), %rbx +; X64-NEXT: shrdq $48, %rbx, %rdi ; X64-NEXT: bswapq %r15 -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: shrq $4, %rdi -; X64-NEXT: andq %r13, %rdi -; X64-NEXT: andq %r13, %r15 +; X64-NEXT: movq %r15, %r13 +; X64-NEXT: shrq $4, %r13 +; X64-NEXT: andq %r11, %r13 +; X64-NEXT: andq %r11, %r15 ; X64-NEXT: shlq $4, %r15 -; X64-NEXT: orq %rdi, %r15 -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: andq %r11, %rdi +; X64-NEXT: orq %r13, %r15 +; X64-NEXT: movq %r15, %r13 +; X64-NEXT: andq %r10, %r13 ; X64-NEXT: shrq $2, %r15 -; X64-NEXT: andq %r11, %r15 -; X64-NEXT: leaq (%r15,%rdi,4), %rdi -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: andq %rbp, %rbx -; X64-NEXT: shrq %rdi -; X64-NEXT: andq %rbp, %rdi -; X64-NEXT: leaq (%rdi,%rbx,2), %r15 -; X64-NEXT: shrdq $48, %r15, %r14 +; X64-NEXT: andq %r10, %r15 +; X64-NEXT: leaq (%r15,%r13,4), %r15 +; X64-NEXT: movq %r15, %r13 +; X64-NEXT: andq %r14, %r13 +; X64-NEXT: shrq %r15 +; X64-NEXT: andq %r14, %r15 +; X64-NEXT: leaq (%r15,%r13,2), %r15 +; X64-NEXT: shrdq $48, %r15, %rbx ; X64-NEXT: bswapq %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: shrq $4, %rdi -; X64-NEXT: andq %r13, %rdi -; X64-NEXT: andq %r13, %r12 +; X64-NEXT: movq %r12, %r13 +; X64-NEXT: shrq $4, %r13 +; X64-NEXT: andq %r11, %r13 +; X64-NEXT: andq %r11, %r12 ; X64-NEXT: shlq $4, %r12 -; X64-NEXT: orq %rdi, %r12 -; X64-NEXT: movq %r12, 
%rdi -; X64-NEXT: andq %r11, %rdi +; X64-NEXT: orq %r13, %r12 +; X64-NEXT: movq %r12, %r13 +; X64-NEXT: andq %r10, %r13 ; X64-NEXT: shrq $2, %r12 -; X64-NEXT: andq %r11, %r12 -; X64-NEXT: leaq (%r12,%rdi,4), %rdi -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: andq %rbp, %rbx -; X64-NEXT: shrq %rdi -; X64-NEXT: andq %rbp, %rdi -; X64-NEXT: leaq (%rdi,%rbx,2), %r12 +; X64-NEXT: andq %r10, %r12 +; X64-NEXT: leaq (%r12,%r13,4), %r12 +; X64-NEXT: movq %r12, %r13 +; X64-NEXT: andq %r14, %r13 +; X64-NEXT: shrq %r12 +; X64-NEXT: andq %r14, %r12 +; X64-NEXT: leaq (%r12,%r13,2), %r12 ; X64-NEXT: shrdq $48, %r12, %r15 ; X64-NEXT: bswapq %r9 -; X64-NEXT: movq %r9, %rdi -; X64-NEXT: shrq $4, %rdi -; X64-NEXT: andq %r13, %rdi -; X64-NEXT: andq %r13, %r9 +; X64-NEXT: movq %r9, %r13 +; X64-NEXT: shrq $4, %r13 +; X64-NEXT: andq %r11, %r13 +; X64-NEXT: andq %r11, %r9 ; X64-NEXT: shlq $4, %r9 -; X64-NEXT: orq %rdi, %r9 -; X64-NEXT: movq %r9, %rdi -; X64-NEXT: andq %r11, %rdi +; X64-NEXT: orq %r13, %r9 +; X64-NEXT: movq %r9, %r13 +; X64-NEXT: andq %r10, %r13 ; X64-NEXT: shrq $2, %r9 -; X64-NEXT: andq %r11, %r9 -; X64-NEXT: leaq (%r9,%rdi,4), %rdi -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: andq %rbp, %rbx -; X64-NEXT: shrq %rdi -; X64-NEXT: andq %rbp, %rdi -; X64-NEXT: leaq (%rdi,%rbx,2), %r9 +; X64-NEXT: andq %r10, %r9 +; X64-NEXT: leaq (%r9,%r13,4), %r9 +; X64-NEXT: movq %r9, %r13 +; X64-NEXT: andq %r14, %r13 +; X64-NEXT: shrq %r9 +; X64-NEXT: andq %r14, %r9 +; X64-NEXT: leaq (%r9,%r13,2), %r9 ; X64-NEXT: shrdq $48, %r9, %r12 ; X64-NEXT: bswapq %r8 -; X64-NEXT: movq %r8, %rdi -; X64-NEXT: shrq $4, %rdi -; X64-NEXT: andq %r13, %rdi -; X64-NEXT: andq %r13, %r8 +; X64-NEXT: movq %r8, %r13 +; X64-NEXT: shrq $4, %r13 +; X64-NEXT: andq %r11, %r13 +; X64-NEXT: andq %r11, %r8 ; X64-NEXT: shlq $4, %r8 -; X64-NEXT: orq %rdi, %r8 -; X64-NEXT: movq %r8, %rdi -; X64-NEXT: andq %r11, %rdi +; X64-NEXT: orq %r13, %r8 +; X64-NEXT: movq %r8, %r13 +; X64-NEXT: andq %r10, %r13 ; X64-NEXT: shrq $2, %r8 -; X64-NEXT: andq %r11, %r8 -; X64-NEXT: leaq (%r8,%rdi,4), %rdi -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: andq %rbp, %rbx -; X64-NEXT: shrq %rdi -; X64-NEXT: andq %rbp, %rdi -; X64-NEXT: leaq (%rdi,%rbx,2), %rdi -; X64-NEXT: shrdq $48, %rdi, %r9 +; X64-NEXT: andq %r10, %r8 +; X64-NEXT: leaq (%r8,%r13,4), %r8 +; X64-NEXT: movq %r8, %r13 +; X64-NEXT: andq %r14, %r13 +; X64-NEXT: shrq %r8 +; X64-NEXT: andq %r14, %r8 +; X64-NEXT: leaq (%r8,%r13,2), %r8 +; X64-NEXT: shrdq $48, %r8, %r9 ; X64-NEXT: bswapq %rcx -; X64-NEXT: movq %rcx, %rbx -; X64-NEXT: shrq $4, %rbx -; X64-NEXT: andq %r13, %rbx -; X64-NEXT: andq %r13, %rcx +; X64-NEXT: movq %rcx, %r13 +; X64-NEXT: shrq $4, %r13 +; X64-NEXT: andq %r11, %r13 +; X64-NEXT: andq %r11, %rcx ; X64-NEXT: shlq $4, %rcx -; X64-NEXT: orq %rbx, %rcx -; X64-NEXT: movq %rcx, %rbx -; X64-NEXT: andq %r11, %rbx +; X64-NEXT: orq %r13, %rcx +; X64-NEXT: movq %rcx, %r13 +; X64-NEXT: andq %r10, %r13 ; X64-NEXT: shrq $2, %rcx -; X64-NEXT: andq %r11, %rcx -; X64-NEXT: leaq (%rcx,%rbx,4), %rcx -; X64-NEXT: movq %rcx, %rbx -; X64-NEXT: andq %rbp, %rbx +; X64-NEXT: andq %r10, %rcx +; X64-NEXT: leaq (%rcx,%r13,4), %rcx +; X64-NEXT: movq %rcx, %r13 +; X64-NEXT: andq %r14, %r13 ; X64-NEXT: shrq %rcx -; X64-NEXT: andq %rbp, %rcx -; X64-NEXT: leaq (%rcx,%rbx,2), %rcx -; X64-NEXT: shrdq $48, %rcx, %rdi +; X64-NEXT: andq %r14, %rcx +; X64-NEXT: leaq (%rcx,%r13,2), %rcx +; X64-NEXT: shrdq $48, %rcx, %r8 ; X64-NEXT: bswapq %rdx -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: shrq $4, %rbx -; X64-NEXT: andq %r13, %rbx -; X64-NEXT: 
andq %r13, %rdx +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: shrq $4, %r13 +; X64-NEXT: andq %r11, %r13 +; X64-NEXT: andq %r11, %rdx ; X64-NEXT: shlq $4, %rdx -; X64-NEXT: orq %rbx, %rdx -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: andq %r11, %rbx +; X64-NEXT: orq %r13, %rdx +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: andq %r10, %r13 ; X64-NEXT: shrq $2, %rdx -; X64-NEXT: andq %r11, %rdx -; X64-NEXT: leaq (%rdx,%rbx,4), %rdx -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: andq %rbp, %rbx +; X64-NEXT: andq %r10, %rdx +; X64-NEXT: leaq (%rdx,%r13,4), %rdx +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: andq %r14, %r13 ; X64-NEXT: shrq %rdx -; X64-NEXT: andq %rbp, %rdx -; X64-NEXT: leaq (%rdx,%rbx,2), %rdx +; X64-NEXT: andq %r14, %rdx +; X64-NEXT: leaq (%rdx,%r13,2), %rdx ; X64-NEXT: shrdq $48, %rdx, %rcx ; X64-NEXT: bswapq %rsi -; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: shrq $4, %rbx -; X64-NEXT: andq %r13, %rbx -; X64-NEXT: andq %r13, %rsi +; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: shrq $4, %r13 +; X64-NEXT: andq %r11, %r13 +; X64-NEXT: andq %r11, %rsi ; X64-NEXT: shlq $4, %rsi -; X64-NEXT: orq %rbx, %rsi -; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: andq %r11, %rbx +; X64-NEXT: orq %r13, %rsi +; X64-NEXT: movq %rsi, %r11 +; X64-NEXT: andq %r10, %r11 ; X64-NEXT: shrq $2, %rsi -; X64-NEXT: andq %r11, %rsi -; X64-NEXT: leaq (%rsi,%rbx,4), %rsi -; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: andq %rbp, %rbx +; X64-NEXT: andq %r10, %rsi +; X64-NEXT: leaq (%rsi,%r11,4), %rsi +; X64-NEXT: movq %rsi, %r10 +; X64-NEXT: andq %r14, %r10 ; X64-NEXT: shrq %rsi -; X64-NEXT: andq %rbp, %rsi -; X64-NEXT: leaq (%rsi,%rbx,2), %rsi +; X64-NEXT: andq %r14, %rsi +; X64-NEXT: leaq (%rsi,%r10,2), %rsi ; X64-NEXT: shrdq $48, %rsi, %rdx ; X64-NEXT: shrq $48, %rsi ; X64-NEXT: movq %rdx, 56(%rax) ; X64-NEXT: movq %rcx, 48(%rax) -; X64-NEXT: movq %rdi, 40(%rax) +; X64-NEXT: movq %r8, 40(%rax) ; X64-NEXT: movq %r9, 32(%rax) ; X64-NEXT: movq %r12, 24(%rax) ; X64-NEXT: movq %r15, 16(%rax) -; X64-NEXT: movq %r14, 8(%rax) -; X64-NEXT: movq %r10, (%rax) +; X64-NEXT: movq %rbx, 8(%rax) +; X64-NEXT: movq %rdi, (%rax) ; X64-NEXT: movw %si, 64(%rax) ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 ; X64-NEXT: popq %r14 ; X64-NEXT: popq %r15 -; X64-NEXT: popq %rbp ; X64-NEXT: retq ; ; X86XOP-LABEL: large_promotion: diff --git a/llvm/test/CodeGen/X86/break-false-dep.ll b/llvm/test/CodeGen/X86/break-false-dep.ll --- a/llvm/test/CodeGen/X86/break-false-dep.ll +++ b/llvm/test/CodeGen/X86/break-false-dep.ll @@ -309,22 +309,22 @@ ; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-WIN-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-WIN-NEXT: movq (%rcx), %rax -; SSE-WIN-NEXT: movl $1, %r8d +; SSE-WIN-NEXT: movl $1, %ecx ; SSE-WIN-NEXT: .p2align 4, 0x90 ; SSE-WIN-NEXT: .LBB7_1: # %loop ; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE-WIN-NEXT: xorps %xmm0, %xmm0 -; SSE-WIN-NEXT: cvtsi2sd %r8, %xmm0 +; SSE-WIN-NEXT: cvtsi2sd %rcx, %xmm0 ; SSE-WIN-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE-WIN-NEXT: #APP ; SSE-WIN-NEXT: #NO_APP ; SSE-WIN-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; SSE-WIN-NEXT: # xmm0 = mem[0],zero ; SSE-WIN-NEXT: addsd (%rdx), %xmm0 -; SSE-WIN-NEXT: cvttsd2si %xmm0, %rcx -; SSE-WIN-NEXT: addq %rcx, %rax -; SSE-WIN-NEXT: incq %r8 -; SSE-WIN-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; SSE-WIN-NEXT: cvttsd2si %xmm0, %r8 +; SSE-WIN-NEXT: addq %r8, %rax +; SSE-WIN-NEXT: incq %rcx +; SSE-WIN-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90 ; 
SSE-WIN-NEXT: jne .LBB7_1 ; SSE-WIN-NEXT: # %bb.2: # %ret ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -354,22 +354,22 @@ ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: movq (%rcx), %rax -; AVX-NEXT: movl $1, %r8d +; AVX-NEXT: movl $1, %ecx ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB7_1: # %loop ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vcvtsi2sd %r8, %xmm1, %xmm0 +; AVX-NEXT: vcvtsi2sd %rcx, %xmm1, %xmm0 ; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX-NEXT: #APP ; AVX-NEXT: #NO_APP ; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; AVX-NEXT: # xmm0 = mem[0],zero ; AVX-NEXT: vaddsd (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vcvttsd2si %xmm0, %rcx -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: incq %r8 -; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; AVX-NEXT: vcvttsd2si %xmm0, %r8 +; AVX-NEXT: addq %r8, %rax +; AVX-NEXT: incq %rcx +; AVX-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90 ; AVX-NEXT: jne .LBB7_1 ; AVX-NEXT: # %bb.2: # %ret ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -472,38 +472,38 @@ ; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill ; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0 ; SSE-WIN-NEXT: .seh_endprologue -; SSE-WIN-NEXT: xorl %r9d, %r9d -; SSE-WIN-NEXT: leaq v(%rip), %r8 -; SSE-WIN-NEXT: leaq x(%rip), %r10 -; SSE-WIN-NEXT: leaq y(%rip), %r11 -; SSE-WIN-NEXT: leaq z(%rip), %rax -; SSE-WIN-NEXT: leaq w(%rip), %rdx +; SSE-WIN-NEXT: xorl %eax, %eax +; SSE-WIN-NEXT: leaq v(%rip), %rcx +; SSE-WIN-NEXT: leaq x(%rip), %rdx +; SSE-WIN-NEXT: leaq y(%rip), %r8 +; SSE-WIN-NEXT: leaq z(%rip), %r9 +; SSE-WIN-NEXT: leaq w(%rip), %r10 ; SSE-WIN-NEXT: .p2align 4, 0x90 ; SSE-WIN-NEXT: .LBB8_1: # %for.cond1.preheader ; SSE-WIN-NEXT: # =>This Loop Header: Depth=1 ; SSE-WIN-NEXT: # Child Loop BB8_2 Depth 2 -; SSE-WIN-NEXT: movq %r8, %rcx +; SSE-WIN-NEXT: movq %rcx, %r11 ; SSE-WIN-NEXT: xorl %esi, %esi ; SSE-WIN-NEXT: .p2align 4, 0x90 ; SSE-WIN-NEXT: .LBB8_2: # %for.body3 ; SSE-WIN-NEXT: # Parent Loop BB8_1 Depth=1 ; SSE-WIN-NEXT: # => This Inner Loop Header: Depth=2 ; SSE-WIN-NEXT: xorps %xmm0, %xmm0 -; SSE-WIN-NEXT: cvtsi2sdl (%rcx), %xmm0 -; SSE-WIN-NEXT: mulsd (%rsi,%r10), %xmm0 -; SSE-WIN-NEXT: mulsd (%rsi,%r11), %xmm0 -; SSE-WIN-NEXT: mulsd (%rsi,%rax), %xmm0 -; SSE-WIN-NEXT: movsd %xmm0, (%rsi,%rdx) +; SSE-WIN-NEXT: cvtsi2sdl (%r11), %xmm0 +; SSE-WIN-NEXT: mulsd (%rsi,%rdx), %xmm0 +; SSE-WIN-NEXT: mulsd (%rsi,%r8), %xmm0 +; SSE-WIN-NEXT: mulsd (%rsi,%r9), %xmm0 +; SSE-WIN-NEXT: movsd %xmm0, (%rsi,%r10) ; SSE-WIN-NEXT: #APP ; SSE-WIN-NEXT: #NO_APP ; SSE-WIN-NEXT: addq $8, %rsi -; SSE-WIN-NEXT: addq $4, %rcx +; SSE-WIN-NEXT: addq $4, %r11 ; SSE-WIN-NEXT: cmpq $8192, %rsi # imm = 0x2000 ; SSE-WIN-NEXT: jne .LBB8_2 ; SSE-WIN-NEXT: # %bb.3: # %for.inc14 ; SSE-WIN-NEXT: # in Loop: Header=BB8_1 Depth=1 -; SSE-WIN-NEXT: incl %r9d -; SSE-WIN-NEXT: cmpl $100000, %r9d # imm = 0x186A0 +; SSE-WIN-NEXT: incl %eax +; SSE-WIN-NEXT: cmpl $100000, %eax # imm = 0x186A0 ; SSE-WIN-NEXT: jne .LBB8_1 ; SSE-WIN-NEXT: # %bb.4: # %for.end16 ; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload @@ -548,38 +548,38 @@ ; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm6, 0 ; AVX-NEXT: .seh_endprologue -; AVX-NEXT: xorl %r9d, %r9d -; AVX-NEXT: leaq v(%rip), %r8 -; AVX-NEXT: leaq x(%rip), %r10 -; AVX-NEXT: leaq y(%rip), %r11 -; AVX-NEXT: leaq 
z(%rip), %rax -; AVX-NEXT: leaq w(%rip), %rdx +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: leaq v(%rip), %rcx +; AVX-NEXT: leaq x(%rip), %rdx +; AVX-NEXT: leaq y(%rip), %r8 +; AVX-NEXT: leaq z(%rip), %r9 +; AVX-NEXT: leaq w(%rip), %r10 ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB8_1: # %for.cond1.preheader ; AVX-NEXT: # =>This Loop Header: Depth=1 ; AVX-NEXT: # Child Loop BB8_2 Depth 2 -; AVX-NEXT: movq %r8, %rcx +; AVX-NEXT: movq %rcx, %r11 ; AVX-NEXT: xorl %esi, %esi ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB8_2: # %for.body3 ; AVX-NEXT: # Parent Loop BB8_1 Depth=1 ; AVX-NEXT: # => This Inner Loop Header: Depth=2 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2sdl (%rcx), %xmm0, %xmm0 -; AVX-NEXT: vmulsd (%rsi,%r10), %xmm0, %xmm0 -; AVX-NEXT: vmulsd (%rsi,%r11), %xmm0, %xmm0 -; AVX-NEXT: vmulsd (%rsi,%rax), %xmm0, %xmm0 -; AVX-NEXT: vmovsd %xmm0, (%rsi,%rdx) +; AVX-NEXT: vcvtsi2sdl (%r11), %xmm0, %xmm0 +; AVX-NEXT: vmulsd (%rsi,%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmulsd (%rsi,%r8), %xmm0, %xmm0 +; AVX-NEXT: vmulsd (%rsi,%r9), %xmm0, %xmm0 +; AVX-NEXT: vmovsd %xmm0, (%rsi,%r10) ; AVX-NEXT: #APP ; AVX-NEXT: #NO_APP ; AVX-NEXT: addq $8, %rsi -; AVX-NEXT: addq $4, %rcx +; AVX-NEXT: addq $4, %r11 ; AVX-NEXT: cmpq $8192, %rsi # imm = 0x2000 ; AVX-NEXT: jne .LBB8_2 ; AVX-NEXT: # %bb.3: # %for.inc14 ; AVX-NEXT: # in Loop: Header=BB8_1 Depth=1 -; AVX-NEXT: incl %r9d -; AVX-NEXT: cmpl $100000, %r9d # imm = 0x186A0 +; AVX-NEXT: incl %eax +; AVX-NEXT: cmpl $100000, %eax # imm = 0x186A0 ; AVX-NEXT: jne .LBB8_1 ; AVX-NEXT: # %bb.4: # %for.end16 ; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload @@ -1154,12 +1154,12 @@ ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-WIN-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill ; SSE-WIN-NEXT: movq (%rcx), %rax -; SSE-WIN-NEXT: movl $1, %r8d +; SSE-WIN-NEXT: movl $1, %ecx ; SSE-WIN-NEXT: .p2align 4, 0x90 ; SSE-WIN-NEXT: .LBB12_1: # %loop ; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE-WIN-NEXT: xorps %xmm4, %xmm4 -; SSE-WIN-NEXT: cvtsi2sd %r8, %xmm4 +; SSE-WIN-NEXT: cvtsi2sd %rcx, %xmm4 ; SSE-WIN-NEXT: #APP ; SSE-WIN-NEXT: #NO_APP ; SSE-WIN-NEXT: #APP @@ -1175,10 +1175,10 @@ ; SSE-WIN-NEXT: #APP ; SSE-WIN-NEXT: #NO_APP ; SSE-WIN-NEXT: addsd (%rdx), %xmm4 -; SSE-WIN-NEXT: cvttsd2si %xmm4, %rcx -; SSE-WIN-NEXT: addq %rcx, %rax -; SSE-WIN-NEXT: incq %r8 -; SSE-WIN-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; SSE-WIN-NEXT: cvttsd2si %xmm4, %r8 +; SSE-WIN-NEXT: addq %r8, %rax +; SSE-WIN-NEXT: incq %rcx +; SSE-WIN-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90 ; SSE-WIN-NEXT: jne .LBB12_1 ; SSE-WIN-NEXT: # %bb.2: # %ret ; SSE-WIN-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload @@ -1204,11 +1204,11 @@ ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill ; AVX-NEXT: movq (%rcx), %rax -; AVX-NEXT: movl $1, %r8d +; AVX-NEXT: movl $1, %ecx ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB12_1: # %loop ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vcvtsi2sd %r8, %xmm5, %xmm4 +; AVX-NEXT: vcvtsi2sd %rcx, %xmm5, %xmm4 ; AVX-NEXT: #APP ; AVX-NEXT: #NO_APP ; AVX-NEXT: #APP @@ -1224,10 +1224,10 @@ ; AVX-NEXT: #APP ; AVX-NEXT: #NO_APP ; AVX-NEXT: vaddsd (%rdx), %xmm4, %xmm0 -; AVX-NEXT: vcvttsd2si %xmm0, %rcx -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: incq %r8 -; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; AVX-NEXT: vcvttsd2si %xmm0, %r8 +; AVX-NEXT: addq %r8, %rax +; AVX-NEXT: incq %rcx +; AVX-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90 ; AVX-NEXT: jne 
.LBB12_1 ; AVX-NEXT: # %bb.2: # %ret ; AVX-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload @@ -1286,34 +1286,34 @@ ; SSE-LINUX-NEXT: #NO_APP ; SSE-LINUX-NEXT: #APP ; SSE-LINUX-NEXT: #NO_APP -; SSE-LINUX-NEXT: movl $1, %r8d +; SSE-LINUX-NEXT: movl $1, %eax ; SSE-LINUX-NEXT: xorl %ecx, %ecx ; SSE-LINUX-NEXT: .p2align 4, 0x90 ; SSE-LINUX-NEXT: .LBB13_1: # %inner_loop ; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE-LINUX-NEXT: movq %rcx, %rax -; SSE-LINUX-NEXT: shrq $6, %rcx -; SSE-LINUX-NEXT: movq (%rsi,%rcx,8), %rcx -; SSE-LINUX-NEXT: btq %rax, %rcx -; SSE-LINUX-NEXT: leaq 1(%rax), %rcx +; SSE-LINUX-NEXT: movq %rcx, %r8 +; SSE-LINUX-NEXT: shrq $6, %r8 +; SSE-LINUX-NEXT: movq (%rsi,%r8,8), %r8 +; SSE-LINUX-NEXT: btq %rcx, %r8 +; SSE-LINUX-NEXT: leaq 1(%rcx), %rcx ; SSE-LINUX-NEXT: jae .LBB13_1 ; SSE-LINUX-NEXT: # %bb.2: # %loop_end ; SSE-LINUX-NEXT: # in Loop: Header=BB13_1 Depth=1 -; SSE-LINUX-NEXT: leaq 1(%r8), %r9 +; SSE-LINUX-NEXT: leaq 1(%rax), %r8 ; SSE-LINUX-NEXT: xorps %xmm4, %xmm4 -; SSE-LINUX-NEXT: cvtsi2sd %r9, %xmm4 +; SSE-LINUX-NEXT: cvtsi2sd %r8, %xmm4 ; SSE-LINUX-NEXT: movapd %xmm0, %xmm5 ; SSE-LINUX-NEXT: subsd %xmm4, %xmm5 ; SSE-LINUX-NEXT: mulsd %xmm1, %xmm5 -; SSE-LINUX-NEXT: leaq -1(%rcx), %rax +; SSE-LINUX-NEXT: leaq -1(%rcx), %r9 ; SSE-LINUX-NEXT: xorps %xmm4, %xmm4 -; SSE-LINUX-NEXT: cvtsi2sd %rax, %xmm4 +; SSE-LINUX-NEXT: cvtsi2sd %r9, %xmm4 ; SSE-LINUX-NEXT: mulsd %xmm2, %xmm4 ; SSE-LINUX-NEXT: addsd %xmm5, %xmm4 ; SSE-LINUX-NEXT: divsd %xmm3, %xmm4 -; SSE-LINUX-NEXT: movsd %xmm4, -8(%rdi,%r8,8) -; SSE-LINUX-NEXT: movq %r9, %r8 -; SSE-LINUX-NEXT: cmpq %r9, %rdx +; SSE-LINUX-NEXT: movsd %xmm4, -8(%rdi,%rax,8) +; SSE-LINUX-NEXT: movq %r8, %rax +; SSE-LINUX-NEXT: cmpq %r8, %rdx ; SSE-LINUX-NEXT: jge .LBB13_1 ; SSE-LINUX-NEXT: # %bb.3: # %loopdone ; SSE-LINUX-NEXT: retq @@ -1341,7 +1341,7 @@ ; SSE-WIN-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill ; SSE-WIN-NEXT: .seh_savexmm %xmm7, 0 ; SSE-WIN-NEXT: .seh_endprologue -; SSE-WIN-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE-WIN-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE-WIN-NEXT: #APP @@ -1358,35 +1358,34 @@ ; SSE-WIN-NEXT: #NO_APP ; SSE-WIN-NEXT: #APP ; SSE-WIN-NEXT: #NO_APP -; SSE-WIN-NEXT: movl $1, %r9d -; SSE-WIN-NEXT: xorl %r11d, %r11d +; SSE-WIN-NEXT: movl $1, %r8d +; SSE-WIN-NEXT: xorl %r9d, %r9d ; SSE-WIN-NEXT: .p2align 4, 0x90 ; SSE-WIN-NEXT: .LBB13_1: # %inner_loop ; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE-WIN-NEXT: movq %r11, %r10 -; SSE-WIN-NEXT: movq %r11, %rax -; SSE-WIN-NEXT: shrq $6, %rax -; SSE-WIN-NEXT: movq (%rdx,%rax,8), %rax -; SSE-WIN-NEXT: btq %r11, %rax -; SSE-WIN-NEXT: leaq 1(%r11), %r11 +; SSE-WIN-NEXT: movq %r9, %r10 +; SSE-WIN-NEXT: shrq $6, %r10 +; SSE-WIN-NEXT: movq (%rdx,%r10,8), %r10 +; SSE-WIN-NEXT: btq %r9, %r10 +; SSE-WIN-NEXT: leaq 1(%r9), %r9 ; SSE-WIN-NEXT: jae .LBB13_1 ; SSE-WIN-NEXT: # %bb.2: # %loop_end ; SSE-WIN-NEXT: # in Loop: Header=BB13_1 Depth=1 -; SSE-WIN-NEXT: leaq 1(%r9), %r10 +; SSE-WIN-NEXT: leaq 1(%r8), %r10 ; SSE-WIN-NEXT: xorps %xmm4, %xmm4 ; SSE-WIN-NEXT: cvtsi2sd %r10, %xmm4 ; SSE-WIN-NEXT: movapd %xmm2, %xmm5 ; SSE-WIN-NEXT: subsd %xmm4, %xmm5 ; SSE-WIN-NEXT: mulsd %xmm3, %xmm5 -; SSE-WIN-NEXT: leaq -1(%r11), %rax +; SSE-WIN-NEXT: leaq -1(%r9), %r11 ; SSE-WIN-NEXT: xorps %xmm4, %xmm4 -; SSE-WIN-NEXT: cvtsi2sd %rax, %xmm4 +; SSE-WIN-NEXT: cvtsi2sd %r11, %xmm4 ; SSE-WIN-NEXT: mulsd %xmm1, %xmm4 ; SSE-WIN-NEXT: addsd %xmm5, %xmm4 ; SSE-WIN-NEXT: divsd 
%xmm0, %xmm4 -; SSE-WIN-NEXT: movsd %xmm4, -8(%rcx,%r9,8) -; SSE-WIN-NEXT: movq %r10, %r9 -; SSE-WIN-NEXT: cmpq %r10, %r8 +; SSE-WIN-NEXT: movsd %xmm4, -8(%rcx,%r8,8) +; SSE-WIN-NEXT: movq %r10, %r8 +; SSE-WIN-NEXT: cmpq %r10, %rax ; SSE-WIN-NEXT: jge .LBB13_1 ; SSE-WIN-NEXT: # %bb.3: # %loopdone ; SSE-WIN-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload @@ -1425,7 +1424,7 @@ ; AVX1-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill ; AVX1-NEXT: .seh_savexmm %xmm7, 0 ; AVX1-NEXT: .seh_endprologue -; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: #APP @@ -1442,32 +1441,31 @@ ; AVX1-NEXT: #NO_APP ; AVX1-NEXT: #APP ; AVX1-NEXT: #NO_APP -; AVX1-NEXT: movl $1, %r9d -; AVX1-NEXT: xorl %r11d, %r11d +; AVX1-NEXT: movl $1, %r8d +; AVX1-NEXT: xorl %r9d, %r9d ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB13_1: # %inner_loop ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: movq %r11, %r10 -; AVX1-NEXT: movq %r11, %rax -; AVX1-NEXT: shrq $6, %rax -; AVX1-NEXT: movq (%rdx,%rax,8), %rax -; AVX1-NEXT: btq %r11, %rax -; AVX1-NEXT: leaq 1(%r11), %r11 +; AVX1-NEXT: movq %r9, %r10 +; AVX1-NEXT: shrq $6, %r10 +; AVX1-NEXT: movq (%rdx,%r10,8), %r10 +; AVX1-NEXT: btq %r9, %r10 +; AVX1-NEXT: leaq 1(%r9), %r9 ; AVX1-NEXT: jae .LBB13_1 ; AVX1-NEXT: # %bb.2: # %loop_end ; AVX1-NEXT: # in Loop: Header=BB13_1 Depth=1 -; AVX1-NEXT: leaq 1(%r9), %r10 +; AVX1-NEXT: leaq 1(%r8), %r10 ; AVX1-NEXT: vcvtsi2sd %r10, %xmm6, %xmm4 ; AVX1-NEXT: vsubsd %xmm4, %xmm2, %xmm4 ; AVX1-NEXT: vmulsd %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: leaq -1(%r11), %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm6, %xmm5 +; AVX1-NEXT: leaq -1(%r9), %r11 +; AVX1-NEXT: vcvtsi2sd %r11, %xmm6, %xmm5 ; AVX1-NEXT: vmulsd %xmm1, %xmm5, %xmm5 ; AVX1-NEXT: vaddsd %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vdivsd %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vmovsd %xmm4, -8(%rcx,%r9,8) -; AVX1-NEXT: movq %r10, %r9 -; AVX1-NEXT: cmpq %r10, %r8 +; AVX1-NEXT: vmovsd %xmm4, -8(%rcx,%r8,8) +; AVX1-NEXT: movq %r10, %r8 +; AVX1-NEXT: cmpq %r10, %rax ; AVX1-NEXT: jge .LBB13_1 ; AVX1-NEXT: # %bb.3: # %loopdone ; AVX1-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload @@ -1507,7 +1505,7 @@ ; AVX512VL-NEXT: .seh_savexmm %xmm7, 0 ; AVX512VL-NEXT: .seh_endprologue ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512VL-NEXT: #APP ; AVX512VL-NEXT: #NO_APP @@ -1523,32 +1521,31 @@ ; AVX512VL-NEXT: #NO_APP ; AVX512VL-NEXT: #APP ; AVX512VL-NEXT: #NO_APP -; AVX512VL-NEXT: movl $1, %r9d -; AVX512VL-NEXT: xorl %r11d, %r11d +; AVX512VL-NEXT: movl $1, %r8d +; AVX512VL-NEXT: xorl %r9d, %r9d ; AVX512VL-NEXT: .p2align 4, 0x90 ; AVX512VL-NEXT: .LBB13_1: # %inner_loop ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VL-NEXT: movq %r11, %r10 -; AVX512VL-NEXT: movq %r11, %rax -; AVX512VL-NEXT: shrq $6, %rax -; AVX512VL-NEXT: movq (%rdx,%rax,8), %rax -; AVX512VL-NEXT: btq %r11, %rax -; AVX512VL-NEXT: leaq 1(%r11), %r11 +; AVX512VL-NEXT: movq %r9, %r10 +; AVX512VL-NEXT: shrq $6, %r10 +; AVX512VL-NEXT: movq (%rdx,%r10,8), %r10 +; AVX512VL-NEXT: btq %r9, %r10 +; AVX512VL-NEXT: leaq 1(%r9), %r9 ; AVX512VL-NEXT: jae .LBB13_1 ; AVX512VL-NEXT: # %bb.2: # %loop_end ; AVX512VL-NEXT: # in Loop: Header=BB13_1 Depth=1 -; AVX512VL-NEXT: leaq 1(%r9), %r10 +; AVX512VL-NEXT: leaq 1(%r8), %r10 ; AVX512VL-NEXT: vcvtsi2sd %r10, %xmm6, %xmm4 ; 
AVX512VL-NEXT: vsubsd %xmm4, %xmm2, %xmm4 ; AVX512VL-NEXT: vmulsd %xmm3, %xmm4, %xmm4 -; AVX512VL-NEXT: leaq -1(%r11), %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm6, %xmm5 +; AVX512VL-NEXT: leaq -1(%r9), %r11 +; AVX512VL-NEXT: vcvtsi2sd %r11, %xmm6, %xmm5 ; AVX512VL-NEXT: vmulsd %xmm1, %xmm5, %xmm5 ; AVX512VL-NEXT: vaddsd %xmm5, %xmm4, %xmm4 ; AVX512VL-NEXT: vdivsd %xmm0, %xmm4, %xmm4 -; AVX512VL-NEXT: vmovsd %xmm4, -8(%rcx,%r9,8) -; AVX512VL-NEXT: movq %r10, %r9 -; AVX512VL-NEXT: cmpq %r10, %r8 +; AVX512VL-NEXT: vmovsd %xmm4, -8(%rcx,%r8,8) +; AVX512VL-NEXT: movq %r10, %r8 +; AVX512VL-NEXT: cmpq %r10, %rax ; AVX512VL-NEXT: jge .LBB13_1 ; AVX512VL-NEXT: # %bb.3: # %loopdone ; AVX512VL-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll --- a/llvm/test/CodeGen/X86/bswap.ll +++ b/llvm/test/CodeGen/X86/bswap.ll @@ -355,13 +355,13 @@ ; CHECK64-NEXT: movq %rdi, %rax ; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK64-NEXT: bswapq %r10 +; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK64-NEXT: bswapq %rdi -; CHECK64-NEXT: shrdq $48, %rdi, %r10 +; CHECK64-NEXT: bswapq %r10 +; CHECK64-NEXT: shrdq $48, %r10, %rdi ; CHECK64-NEXT: bswapq %r11 -; CHECK64-NEXT: shrdq $48, %r11, %rdi +; CHECK64-NEXT: shrdq $48, %r11, %r10 ; CHECK64-NEXT: bswapq %rbx ; CHECK64-NEXT: shrdq $48, %rbx, %r11 ; CHECK64-NEXT: bswapq %r9 @@ -381,8 +381,8 @@ ; CHECK64-NEXT: movq %r9, 32(%rax) ; CHECK64-NEXT: movq %rbx, 24(%rax) ; CHECK64-NEXT: movq %r11, 16(%rax) -; CHECK64-NEXT: movq %rdi, 8(%rax) -; CHECK64-NEXT: movq %r10, (%rax) +; CHECK64-NEXT: movq %r10, 8(%rax) +; CHECK64-NEXT: movq %rdi, (%rax) ; CHECK64-NEXT: movw %si, 64(%rax) ; CHECK64-NEXT: popq %rbx ; CHECK64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll b/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll --- a/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll +++ b/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll @@ -18,34 +18,34 @@ ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movabsq $-2305847407260205056, %rbx # imm = 0xDFFFFC0000000000 +; CHECK-NEXT: movabsq $-2305847407260205056, %r14 # imm = 0xDFFFFC0000000000 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB0_5 ; CHECK-NEXT: # %bb.1: # %bb5 -; CHECK-NEXT: movq %rsi, %r14 +; CHECK-NEXT: movq %rsi, %rbx ; CHECK-NEXT: movslq %edi, %rbp ; CHECK-NEXT: leaq (,%rbp,8), %rax -; CHECK-NEXT: leaq global(%rax,%rax,2), %r15 -; CHECK-NEXT: leaq global+4(%rax,%rax,2), %r12 +; CHECK-NEXT: leaq global(%rax,%rax,2), %r14 +; CHECK-NEXT: leaq global+4(%rax,%rax,2), %r15 ; CHECK-NEXT: xorl %r13d, %r13d ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_2: # %bb8 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: callq bar@PLT -; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: movq %rax, %r12 ; CHECK-NEXT: movq %rax, %rdi -; CHECK-NEXT: callq *%r14 -; CHECK-NEXT: movq %r15, %rdi +; CHECK-NEXT: callq *%rbx +; CHECK-NEXT: movq %r14, %rdi ; CHECK-NEXT: callq hoge@PLT -; CHECK-NEXT: movq %r12, %rdi +; CHECK-NEXT: movq %r15, %rdi ; CHECK-NEXT: callq hoge@PLT ; CHECK-NEXT: testb %r13b, %r13b ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.3: # %bb15 ; CHECK-NEXT: leaq (%rbp,%rbp,2), %rax -; CHECK-NEXT: movq %rbx, global+16(,%rax,8) -; CHECK-NEXT: movabsq $-2305847407260205056, %rbx # imm = 0xDFFFFC0000000000 +; 
CHECK-NEXT: movq %r12, global+16(,%rax,8) +; CHECK-NEXT: movabsq $-2305847407260205056, %r14 # imm = 0xDFFFFC0000000000 ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: # %bb.4: # %bb17 @@ -53,7 +53,7 @@ ; CHECK-NEXT: .LBB0_5: # Block address taken ; CHECK-NEXT: # %bb18 ; CHECK-NEXT: # Label of block must be emitted -; CHECK-NEXT: movw $0, 14(%rbx) +; CHECK-NEXT: movw $0, 14(%r14) ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll b/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll --- a/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll +++ b/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll @@ -16,42 +16,40 @@ ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movl %edx, %ebx +; CHECK-NEXT: movl %edx, %ebp ; CHECK-NEXT: movl %esi, %r12d -; CHECK-NEXT: movq %rdi, %r15 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: callq c -; CHECK-NEXT: movl %eax, %r13d -; CHECK-NEXT: movq %r15, %rdi +; CHECK-NEXT: movl %eax, %r14d +; CHECK-NEXT: movq %rbx, %rdi ; CHECK-NEXT: callq l ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: jne .LBB0_9 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: cmpl $0, e(%rip) -; CHECK-NEXT: # implicit-def: $ebx -; CHECK-NEXT: # implicit-def: $r14d +; CHECK-NEXT: # implicit-def: $r15d +; CHECK-NEXT: # implicit-def: $r13d ; CHECK-NEXT: je .LBB0_4 ; CHECK-NEXT: # %bb.2: # %if.then4 ; CHECK-NEXT: movslq %r12d, %rdi ; CHECK-NEXT: callq m -; CHECK-NEXT: # implicit-def: $ebx -; CHECK-NEXT: # implicit-def: $ebp +; CHECK-NEXT: # implicit-def: $r15d +; CHECK-NEXT: # implicit-def: $r12d ; CHECK-NEXT: .LBB0_3: # %r ; CHECK-NEXT: callq c -; CHECK-NEXT: movl %ebp, %r14d +; CHECK-NEXT: movl %r12d, %r13d ; CHECK-NEXT: .LBB0_4: # %if.end8 -; CHECK-NEXT: movl %ebx, %edi +; CHECK-NEXT: movl %r15d, %edi ; CHECK-NEXT: callq i -; CHECK-NEXT: movl %eax, %ebp -; CHECK-NEXT: orl %r14d, %ebp -; CHECK-NEXT: andl $4, %ebx -; CHECK-NEXT: testl %r13d, %r13d +; CHECK-NEXT: movl %eax, %r12d +; CHECK-NEXT: orl %r13d, %r12d +; CHECK-NEXT: andl $4, %r15d +; CHECK-NEXT: testl %r14d, %r14d ; CHECK-NEXT: jne .LBB0_3 ; CHECK-NEXT: # %bb.5: # %if.end12 -; CHECK-NEXT: testl %ebp, %ebp +; CHECK-NEXT: testl %r12d, %r12d ; CHECK-NEXT: je .LBB0_8 ; CHECK-NEXT: # %bb.6: # %if.then14 -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: jmp .LBB0_9 @@ -59,9 +57,9 @@ ; CHECK-NEXT: # %if.then20.critedge ; CHECK-NEXT: # Label of block must be emitted ; CHECK-NEXT: movl j(%rip), %edi -; CHECK-NEXT: movslq %eax, %rcx +; CHECK-NEXT: movslq %ebp, %rcx ; CHECK-NEXT: movl $1, %esi -; CHECK-NEXT: movq %r15, %rdx +; CHECK-NEXT: movq %rbx, %rdx ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll b/llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll --- a/llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll +++ b/llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll @@ -15,14 +15,14 @@ ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movq %rsi, %r14 +; CHECK-NEXT: movq %rsi, %rbx ; CHECK-NEXT: .LBB0_1: # Block address taken ; CHECK-NEXT: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: # Label of block must be emitted -; CHECK-NEXT: movq (%r14), %rbx +; CHECK-NEXT: movq (%rbx), %r14 ; CHECK-NEXT: callq foo@PLT -; CHECK-NEXT: movq %rbx, 
%rdi +; CHECK-NEXT: movq %r14, %rdi ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: # %bb.2: # %end diff --git a/llvm/test/CodeGen/X86/cgp-usubo.ll b/llvm/test/CodeGen/X86/cgp-usubo.ll --- a/llvm/test/CodeGen/X86/cgp-usubo.ll +++ b/llvm/test/CodeGen/X86/cgp-usubo.ll @@ -171,18 +171,18 @@ ; CHECK-NEXT: testb $1, %bpl ; CHECK-NEXT: je .LBB9_2 ; CHECK-NEXT: # %bb.1: # %t -; CHECK-NEXT: movq %rdx, %r14 +; CHECK-NEXT: movq %rdx, %rbx ; CHECK-NEXT: movq %rsi, %r15 -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rdi, %r14 ; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: cmpq %rsi, %rbx +; CHECK-NEXT: cmpq %rsi, %r14 ; CHECK-NEXT: setb %dil ; CHECK-NEXT: callq call@PLT -; CHECK-NEXT: subq %r15, %rbx +; CHECK-NEXT: subq %r15, %r14 ; CHECK-NEXT: jae .LBB9_2 ; CHECK-NEXT: # %bb.4: # %end ; CHECK-NEXT: setb %al -; CHECK-NEXT: movq %rbx, (%r14) +; CHECK-NEXT: movq %r14, (%rbx) ; CHECK-NEXT: jmp .LBB9_3 ; CHECK-NEXT: .LBB9_2: # %f ; CHECK-NEXT: movl %ebp, %eax diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll --- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -693,52 +693,52 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %r10 -; SSE2-NEXT: movq %r10, %r8 -; SSE2-NEXT: shrq $56, %r8 +; SSE2-NEXT: movq %xmm1, %rdx +; SSE2-NEXT: movq %rdx, %rax +; SSE2-NEXT: shrq $56, %rax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movq %rdx, %rcx +; SSE2-NEXT: shrq $48, %rcx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: shrq $40, %rsi +; SSE2-NEXT: andl $15, %esi +; SSE2-NEXT: movq %rdx, %r8 +; SSE2-NEXT: shrq $32, %r8 ; SSE2-NEXT: andl $15, %r8d +; SSE2-NEXT: movq %xmm0, %r10 +; SSE2-NEXT: movq %r10, %rdi +; SSE2-NEXT: shrq $56, %rdi +; SSE2-NEXT: andl $15, %edi ; SSE2-NEXT: movq %r10, %r9 ; SSE2-NEXT: shrq $48, %r9 ; SSE2-NEXT: andl $15, %r9d -; SSE2-NEXT: movq %r10, %rsi -; SSE2-NEXT: shrq $40, %rsi -; SSE2-NEXT: andl $15, %esi ; SSE2-NEXT: movq %r10, %r11 -; SSE2-NEXT: shrq $32, %r11 +; SSE2-NEXT: shrq $40, %r11 ; SSE2-NEXT: andl $15, %r11d -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: shrq $56, %rdx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq $48, %rcx -; SSE2-NEXT: andl $15, %ecx -; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: shrq $40, %rdi -; SSE2-NEXT: andl $15, %edi -; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: movq %r10, %rbx ; SSE2-NEXT: shrq $32, %rbx ; SSE2-NEXT: andl $15, %ebx ; SSE2-NEXT: shlq $32, %rbx -; SSE2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; SSE2-NEXT: orq %rbx, %rax -; SSE2-NEXT: shlq $40, %rdi -; SSE2-NEXT: orq %rax, %rdi -; SSE2-NEXT: shlq $48, %rcx -; SSE2-NEXT: orq %rdi, %rcx -; SSE2-NEXT: shlq $56, %rdx -; SSE2-NEXT: orq %rcx, %rdx -; SSE2-NEXT: shlq $32, %r11 ; SSE2-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F -; SSE2-NEXT: orq %r11, %r10 -; SSE2-NEXT: shlq $40, %rsi -; SSE2-NEXT: orq %r10, %rsi +; SSE2-NEXT: orq %rbx, %r10 +; SSE2-NEXT: shlq $40, %r11 +; SSE2-NEXT: orq %r10, %r11 ; SSE2-NEXT: shlq $48, %r9 -; SSE2-NEXT: orq %rsi, %r9 -; SSE2-NEXT: shlq $56, %r8 -; SSE2-NEXT: orq %r9, %r8 -; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: movq %r8, %xmm1 +; SSE2-NEXT: orq %r11, %r9 +; SSE2-NEXT: shlq $56, %rdi +; SSE2-NEXT: orq %r9, %rdi +; SSE2-NEXT: shlq $32, %r8 +; SSE2-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; SSE2-NEXT: orq %r8, %rdx +; SSE2-NEXT: shlq $40, 
%rsi +; SSE2-NEXT: orq %rdx, %rsi +; SSE2-NEXT: shlq $48, %rcx +; SSE2-NEXT: orq %rsi, %rcx +; SSE2-NEXT: shlq $56, %rax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: movq %rdi, %xmm0 +; SSE2-NEXT: movq %rax, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq @@ -746,52 +746,52 @@ ; SSE42-LABEL: _clearupper16xi8b: ; SSE42: # %bb.0: ; SSE42-NEXT: pushq %rbx -; SSE42-NEXT: pextrq $1, %xmm0, %r10 -; SSE42-NEXT: movq %r10, %r8 -; SSE42-NEXT: shrq $56, %r8 +; SSE42-NEXT: pextrq $1, %xmm0, %rdx +; SSE42-NEXT: movq %rdx, %rax +; SSE42-NEXT: shrq $56, %rax +; SSE42-NEXT: andl $15, %eax +; SSE42-NEXT: movq %rdx, %rcx +; SSE42-NEXT: shrq $48, %rcx +; SSE42-NEXT: andl $15, %ecx +; SSE42-NEXT: movq %rdx, %rsi +; SSE42-NEXT: shrq $40, %rsi +; SSE42-NEXT: andl $15, %esi +; SSE42-NEXT: movq %rdx, %r8 +; SSE42-NEXT: shrq $32, %r8 ; SSE42-NEXT: andl $15, %r8d +; SSE42-NEXT: movq %xmm0, %r10 +; SSE42-NEXT: movq %r10, %rdi +; SSE42-NEXT: shrq $56, %rdi +; SSE42-NEXT: andl $15, %edi ; SSE42-NEXT: movq %r10, %r9 ; SSE42-NEXT: shrq $48, %r9 ; SSE42-NEXT: andl $15, %r9d -; SSE42-NEXT: movq %r10, %rsi -; SSE42-NEXT: shrq $40, %rsi -; SSE42-NEXT: andl $15, %esi ; SSE42-NEXT: movq %r10, %r11 -; SSE42-NEXT: shrq $32, %r11 +; SSE42-NEXT: shrq $40, %r11 ; SSE42-NEXT: andl $15, %r11d -; SSE42-NEXT: movq %xmm0, %rax -; SSE42-NEXT: movq %rax, %rdx -; SSE42-NEXT: shrq $56, %rdx -; SSE42-NEXT: andl $15, %edx -; SSE42-NEXT: movq %rax, %rcx -; SSE42-NEXT: shrq $48, %rcx -; SSE42-NEXT: andl $15, %ecx -; SSE42-NEXT: movq %rax, %rdi -; SSE42-NEXT: shrq $40, %rdi -; SSE42-NEXT: andl $15, %edi -; SSE42-NEXT: movq %rax, %rbx +; SSE42-NEXT: movq %r10, %rbx ; SSE42-NEXT: shrq $32, %rbx ; SSE42-NEXT: andl $15, %ebx ; SSE42-NEXT: shlq $32, %rbx -; SSE42-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; SSE42-NEXT: orq %rbx, %rax -; SSE42-NEXT: shlq $40, %rdi -; SSE42-NEXT: orq %rax, %rdi -; SSE42-NEXT: shlq $48, %rcx -; SSE42-NEXT: orq %rdi, %rcx -; SSE42-NEXT: shlq $56, %rdx -; SSE42-NEXT: orq %rcx, %rdx -; SSE42-NEXT: shlq $32, %r11 ; SSE42-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F -; SSE42-NEXT: orq %r11, %r10 -; SSE42-NEXT: shlq $40, %rsi -; SSE42-NEXT: orq %r10, %rsi +; SSE42-NEXT: orq %rbx, %r10 +; SSE42-NEXT: shlq $40, %r11 +; SSE42-NEXT: orq %r10, %r11 ; SSE42-NEXT: shlq $48, %r9 -; SSE42-NEXT: orq %rsi, %r9 -; SSE42-NEXT: shlq $56, %r8 -; SSE42-NEXT: orq %r9, %r8 -; SSE42-NEXT: movq %r8, %xmm1 -; SSE42-NEXT: movq %rdx, %xmm0 +; SSE42-NEXT: orq %r11, %r9 +; SSE42-NEXT: shlq $56, %rdi +; SSE42-NEXT: orq %r9, %rdi +; SSE42-NEXT: shlq $32, %r8 +; SSE42-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; SSE42-NEXT: orq %r8, %rdx +; SSE42-NEXT: shlq $40, %rsi +; SSE42-NEXT: orq %rdx, %rsi +; SSE42-NEXT: shlq $48, %rcx +; SSE42-NEXT: orq %rsi, %rcx +; SSE42-NEXT: shlq $56, %rax +; SSE42-NEXT: orq %rcx, %rax +; SSE42-NEXT: movq %rax, %xmm1 +; SSE42-NEXT: movq %rdi, %xmm0 ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE42-NEXT: popq %rbx ; SSE42-NEXT: retq @@ -800,52 +800,52 @@ ; AVX: # %bb.0: ; AVX-NEXT: pushq %rbx ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %r9 ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; AVX-NEXT: movq %r9, %r8 -; AVX-NEXT: shrq $56, %r8 -; AVX-NEXT: andl $15, %r8d -; AVX-NEXT: movq %r9, %r10 -; AVX-NEXT: shrq $48, %r10 -; AVX-NEXT: andl $15, %r10d -; AVX-NEXT: movq %r9, %rsi -; AVX-NEXT: shrq $40, %rsi -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: movq %r9, %r11 -; AVX-NEXT: shrq $32, %r11 -; AVX-NEXT: andl $15, %r11d 
-; AVX-NEXT: movq %rdx, %rdi -; AVX-NEXT: shrq $56, %rdi -; AVX-NEXT: andl $15, %edi +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdi ; AVX-NEXT: movq %rdx, %rax -; AVX-NEXT: shrq $48, %rax +; AVX-NEXT: shrq $56, %rax ; AVX-NEXT: andl $15, %eax ; AVX-NEXT: movq %rdx, %rcx -; AVX-NEXT: shrq $40, %rcx +; AVX-NEXT: shrq $48, %rcx ; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: movq %rdx, %rsi +; AVX-NEXT: shrq $40, %rsi +; AVX-NEXT: andl $15, %esi +; AVX-NEXT: movq %rdx, %r8 +; AVX-NEXT: shrq $32, %r8 +; AVX-NEXT: andl $15, %r8d +; AVX-NEXT: movq %rdi, %r9 +; AVX-NEXT: shrq $56, %r9 +; AVX-NEXT: andl $15, %r9d +; AVX-NEXT: movq %rdi, %r10 +; AVX-NEXT: shrq $48, %r10 +; AVX-NEXT: andl $15, %r10d +; AVX-NEXT: movq %rdi, %r11 +; AVX-NEXT: shrq $40, %r11 +; AVX-NEXT: andl $15, %r11d +; AVX-NEXT: movq %rdi, %rbx ; AVX-NEXT: shrq $32, %rbx ; AVX-NEXT: andl $15, %ebx ; AVX-NEXT: shlq $32, %rbx +; AVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F +; AVX-NEXT: orq %rbx, %rdi +; AVX-NEXT: shlq $40, %r11 +; AVX-NEXT: orq %rdi, %r11 +; AVX-NEXT: shlq $48, %r10 +; AVX-NEXT: orq %r11, %r10 +; AVX-NEXT: shlq $56, %r9 +; AVX-NEXT: orq %r10, %r9 +; AVX-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX-NEXT: shlq $32, %r8 ; AVX-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; AVX-NEXT: orq %rbx, %rdx -; AVX-NEXT: shlq $40, %rcx -; AVX-NEXT: orq %rdx, %rcx -; AVX-NEXT: shlq $48, %rax -; AVX-NEXT: orq %rcx, %rax -; AVX-NEXT: shlq $56, %rdi -; AVX-NEXT: orq %rax, %rdi -; AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; AVX-NEXT: shlq $32, %r11 -; AVX-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F -; AVX-NEXT: orq %r11, %r9 +; AVX-NEXT: orq %r8, %rdx ; AVX-NEXT: shlq $40, %rsi -; AVX-NEXT: orq %r9, %rsi -; AVX-NEXT: shlq $48, %r10 -; AVX-NEXT: orq %rsi, %r10 -; AVX-NEXT: shlq $56, %r8 -; AVX-NEXT: orq %r10, %r8 -; AVX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX-NEXT: orq %rdx, %rsi +; AVX-NEXT: shlq $48, %rcx +; AVX-NEXT: orq %rsi, %rcx +; AVX-NEXT: shlq $56, %rax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: popq %rbx ; AVX-NEXT: retq @@ -875,52 +875,52 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %r10 -; SSE2-NEXT: movq %r10, %r8 -; SSE2-NEXT: shrq $56, %r8 +; SSE2-NEXT: movq %xmm2, %rdx +; SSE2-NEXT: movq %rdx, %rax +; SSE2-NEXT: shrq $56, %rax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movq %rdx, %rcx +; SSE2-NEXT: shrq $48, %rcx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: shrq $40, %rsi +; SSE2-NEXT: andl $15, %esi +; SSE2-NEXT: movq %rdx, %r8 +; SSE2-NEXT: shrq $32, %r8 ; SSE2-NEXT: andl $15, %r8d +; SSE2-NEXT: movq %xmm0, %r10 +; SSE2-NEXT: movq %r10, %rdi +; SSE2-NEXT: shrq $56, %rdi +; SSE2-NEXT: andl $15, %edi ; SSE2-NEXT: movq %r10, %r9 ; SSE2-NEXT: shrq $48, %r9 ; SSE2-NEXT: andl $15, %r9d -; SSE2-NEXT: movq %r10, %rsi -; SSE2-NEXT: shrq $40, %rsi -; SSE2-NEXT: andl $15, %esi ; SSE2-NEXT: movq %r10, %r11 -; SSE2-NEXT: shrq $32, %r11 +; SSE2-NEXT: shrq $40, %r11 ; SSE2-NEXT: andl $15, %r11d -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: shrq $56, %rdx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq $48, %rcx -; SSE2-NEXT: andl $15, %ecx -; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: shrq $40, %rdi -; SSE2-NEXT: andl $15, %edi -; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: movq %r10, %rbx ; SSE2-NEXT: shrq $32, %rbx ; SSE2-NEXT: andl $15, %ebx ; SSE2-NEXT: shlq $32, %rbx -; 
SSE2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; SSE2-NEXT: orq %rbx, %rax -; SSE2-NEXT: shlq $40, %rdi -; SSE2-NEXT: orq %rax, %rdi -; SSE2-NEXT: shlq $48, %rcx -; SSE2-NEXT: orq %rdi, %rcx -; SSE2-NEXT: shlq $56, %rdx -; SSE2-NEXT: orq %rcx, %rdx -; SSE2-NEXT: shlq $32, %r11 ; SSE2-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F -; SSE2-NEXT: orq %r11, %r10 -; SSE2-NEXT: shlq $40, %rsi -; SSE2-NEXT: orq %r10, %rsi +; SSE2-NEXT: orq %rbx, %r10 +; SSE2-NEXT: shlq $40, %r11 +; SSE2-NEXT: orq %r10, %r11 ; SSE2-NEXT: shlq $48, %r9 -; SSE2-NEXT: orq %rsi, %r9 -; SSE2-NEXT: shlq $56, %r8 -; SSE2-NEXT: orq %r9, %r8 -; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: movq %r8, %xmm2 +; SSE2-NEXT: orq %r11, %r9 +; SSE2-NEXT: shlq $56, %rdi +; SSE2-NEXT: orq %r9, %rdi +; SSE2-NEXT: shlq $32, %r8 +; SSE2-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; SSE2-NEXT: orq %r8, %rdx +; SSE2-NEXT: shlq $40, %rsi +; SSE2-NEXT: orq %rdx, %rsi +; SSE2-NEXT: shlq $48, %rcx +; SSE2-NEXT: orq %rsi, %rcx +; SSE2-NEXT: shlq $56, %rax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: movq %rdi, %xmm0 +; SSE2-NEXT: movq %rax, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq @@ -928,52 +928,52 @@ ; SSE42-LABEL: _clearupper32xi8b: ; SSE42: # %bb.0: ; SSE42-NEXT: pushq %rbx -; SSE42-NEXT: pextrq $1, %xmm0, %r10 -; SSE42-NEXT: movq %r10, %r8 -; SSE42-NEXT: shrq $56, %r8 +; SSE42-NEXT: pextrq $1, %xmm0, %rdx +; SSE42-NEXT: movq %rdx, %rax +; SSE42-NEXT: shrq $56, %rax +; SSE42-NEXT: andl $15, %eax +; SSE42-NEXT: movq %rdx, %rcx +; SSE42-NEXT: shrq $48, %rcx +; SSE42-NEXT: andl $15, %ecx +; SSE42-NEXT: movq %rdx, %rsi +; SSE42-NEXT: shrq $40, %rsi +; SSE42-NEXT: andl $15, %esi +; SSE42-NEXT: movq %rdx, %r8 +; SSE42-NEXT: shrq $32, %r8 ; SSE42-NEXT: andl $15, %r8d +; SSE42-NEXT: movq %xmm0, %r10 +; SSE42-NEXT: movq %r10, %rdi +; SSE42-NEXT: shrq $56, %rdi +; SSE42-NEXT: andl $15, %edi ; SSE42-NEXT: movq %r10, %r9 ; SSE42-NEXT: shrq $48, %r9 ; SSE42-NEXT: andl $15, %r9d -; SSE42-NEXT: movq %r10, %rsi -; SSE42-NEXT: shrq $40, %rsi -; SSE42-NEXT: andl $15, %esi ; SSE42-NEXT: movq %r10, %r11 -; SSE42-NEXT: shrq $32, %r11 +; SSE42-NEXT: shrq $40, %r11 ; SSE42-NEXT: andl $15, %r11d -; SSE42-NEXT: movq %xmm0, %rax -; SSE42-NEXT: movq %rax, %rdx -; SSE42-NEXT: shrq $56, %rdx -; SSE42-NEXT: andl $15, %edx -; SSE42-NEXT: movq %rax, %rcx -; SSE42-NEXT: shrq $48, %rcx -; SSE42-NEXT: andl $15, %ecx -; SSE42-NEXT: movq %rax, %rdi -; SSE42-NEXT: shrq $40, %rdi -; SSE42-NEXT: andl $15, %edi -; SSE42-NEXT: movq %rax, %rbx +; SSE42-NEXT: movq %r10, %rbx ; SSE42-NEXT: shrq $32, %rbx ; SSE42-NEXT: andl $15, %ebx ; SSE42-NEXT: shlq $32, %rbx -; SSE42-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; SSE42-NEXT: orq %rbx, %rax -; SSE42-NEXT: shlq $40, %rdi -; SSE42-NEXT: orq %rax, %rdi -; SSE42-NEXT: shlq $48, %rcx -; SSE42-NEXT: orq %rdi, %rcx -; SSE42-NEXT: shlq $56, %rdx -; SSE42-NEXT: orq %rcx, %rdx -; SSE42-NEXT: shlq $32, %r11 ; SSE42-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F -; SSE42-NEXT: orq %r11, %r10 -; SSE42-NEXT: shlq $40, %rsi -; SSE42-NEXT: orq %r10, %rsi +; SSE42-NEXT: orq %rbx, %r10 +; SSE42-NEXT: shlq $40, %r11 +; SSE42-NEXT: orq %r10, %r11 ; SSE42-NEXT: shlq $48, %r9 -; SSE42-NEXT: orq %rsi, %r9 -; SSE42-NEXT: shlq $56, %r8 -; SSE42-NEXT: orq %r9, %r8 -; SSE42-NEXT: movq %r8, %xmm2 -; SSE42-NEXT: movq %rdx, %xmm0 +; SSE42-NEXT: orq %r11, %r9 +; SSE42-NEXT: shlq $56, %rdi +; SSE42-NEXT: orq %r9, %rdi +; SSE42-NEXT: shlq $32, %r8 +; SSE42-NEXT: andl $252645135, %edx # imm = 
0xF0F0F0F +; SSE42-NEXT: orq %r8, %rdx +; SSE42-NEXT: shlq $40, %rsi +; SSE42-NEXT: orq %rdx, %rsi +; SSE42-NEXT: shlq $48, %rcx +; SSE42-NEXT: orq %rsi, %rcx +; SSE42-NEXT: shlq $56, %rax +; SSE42-NEXT: orq %rcx, %rax +; SSE42-NEXT: movq %rax, %xmm2 +; SSE42-NEXT: movq %rdi, %xmm0 ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE42-NEXT: popq %rbx ; SSE42-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -476,11 +476,11 @@ ; SSE: # %bb.0: # %start ; SSE-NEXT: movslq %esi, %rax ; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrlq $32, %xmm3 ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: .p2align 4, 0x90 ; SSE-NEXT: .LBB8_1: # %loop @@ -489,44 +489,44 @@ ; SSE-NEXT: pmovsxdq 2097168(%rdi,%rax), %xmm4 ; SSE-NEXT: pmovsxdq 2097152(%rdi,%rax), %xmm6 ; SSE-NEXT: pmovsxdq 2097160(%rdi,%rax), %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pmuludq %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pmuludq %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pmuludq %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pmuludq %xmm7, %xmm9 ; SSE-NEXT: psrlq $32, %xmm7 -; SSE-NEXT: pmuludq %xmm9, %xmm7 -; SSE-NEXT: paddq %xmm3, %xmm7 +; SSE-NEXT: pmuludq %xmm2, %xmm7 +; SSE-NEXT: paddq %xmm8, %xmm7 ; SSE-NEXT: psllq $32, %xmm7 -; SSE-NEXT: paddq %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pmuludq %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pmuludq %xmm6, %xmm3 +; SSE-NEXT: paddq %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pmuludq %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pmuludq %xmm6, %xmm9 ; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm9, %xmm6 -; SSE-NEXT: paddq %xmm2, %xmm6 +; SSE-NEXT: pmuludq %xmm2, %xmm6 +; SSE-NEXT: paddq %xmm8, %xmm6 ; SSE-NEXT: psllq $32, %xmm6 -; SSE-NEXT: paddq %xmm3, %xmm6 +; SSE-NEXT: paddq %xmm9, %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm7[1,3] ; SSE-NEXT: paddd %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pmuludq %xmm4, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm9, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pmuludq %xmm5, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm9, %xmm5 -; SSE-NEXT: paddq %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: psrlq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pmuludq %xmm4, %xmm7 +; SSE-NEXT: paddq %xmm6, %xmm7 +; SSE-NEXT: psllq $32, %xmm7 +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: paddq %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: psrlq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pmuludq %xmm5, %xmm7 +; SSE-NEXT: paddq %xmm6, %xmm7 +; SSE-NEXT: psllq $32, %xmm7 +; SSE-NEXT: pmuludq %xmm2, %xmm5 +; SSE-NEXT: paddq %xmm7, %xmm5 ; SSE-NEXT: shufps 
{{.*#+}} xmm4 = xmm4[1,3],xmm5[1,3] ; SSE-NEXT: paddd %xmm4, %xmm1 ; SSE-NEXT: subq $-128, %rax diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -551,8 +551,8 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psraw $15, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = -; SSE2-NEXT: pmulhuw %xmm8, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = +; SSE2-NEXT: pmulhuw %xmm7, %xmm0 ; SSE2-NEXT: paddw %xmm3, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535] ; SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -568,40 +568,40 @@ ; SSE2-NEXT: movdqa %xmm5, %xmm2 ; SSE2-NEXT: pandn %xmm6, %xmm2 ; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,0,0,65535,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,0] ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: psraw $1, %xmm2 -; SSE2-NEXT: movdqa %xmm7, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pandn %xmm2, %xmm8 +; SSE2-NEXT: por %xmm0, %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psraw $15, %xmm3 -; SSE2-NEXT: pmulhuw %xmm8, %xmm3 +; SSE2-NEXT: pmulhuw %xmm7, %xmm3 ; SSE2-NEXT: paddw %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: psraw $4, %xmm3 ; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: por %xmm7, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: psraw $2, %xmm4 ; SSE2-NEXT: pandn %xmm4, %xmm5 ; SSE2-NEXT: por %xmm3, %xmm5 ; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: psraw $1, %xmm5 -; SSE2-NEXT: pandn %xmm5, %xmm7 -; SSE2-NEXT: por %xmm3, %xmm7 -; SSE2-NEXT: pand %xmm2, %xmm7 +; SSE2-NEXT: pandn %xmm5, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: pand %xmm2, %xmm6 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; @@ -709,114 +709,114 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psraw $15, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = ; SSE2-NEXT: pmulhuw %xmm9, %xmm0 ; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,0,0,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,0,0,65535] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: movdqa %xmm11, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pandn %xmm0, %xmm8 +; SSE2-NEXT: por %xmm4, %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535] -; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm8, 
%xmm0 ; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: psraw $2, %xmm5 +; SSE2-NEXT: psraw $2, %xmm8 ; SSE2-NEXT: movdqa %xmm7, %xmm4 -; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm8, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,0,0,65535,0] ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: psraw $1, %xmm4 -; SSE2-NEXT: movdqa %xmm10, %xmm5 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm12, %xmm5 -; SSE2-NEXT: movdqa %xmm12, %xmm0 +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: pandn %xmm4, %xmm10 +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm4, %xmm10 +; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: por %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: psraw $15, %xmm1 ; SSE2-NEXT: pmulhuw %xmm9, %xmm1 -; SSE2-NEXT: paddw %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pand %xmm11, %xmm5 +; SSE2-NEXT: paddw %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pand %xmm6, %xmm10 ; SSE2-NEXT: psraw $4, %xmm1 -; SSE2-NEXT: movdqa %xmm11, %xmm6 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm11 +; SSE2-NEXT: pandn %xmm1, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm1 ; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: psraw $2, %xmm6 -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm10, %xmm1 -; SSE2-NEXT: psraw $1, %xmm5 -; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: pandn %xmm5, %xmm6 -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: pand %xmm12, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: psraw $2, %xmm11 +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pandn %xmm11, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: psraw $1, %xmm10 +; SSE2-NEXT: movdqa %xmm8, %xmm11 +; SSE2-NEXT: pandn %xmm10, %xmm11 +; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm4, %xmm11 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm1 +; SSE2-NEXT: por %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: psraw $15, %xmm5 ; SSE2-NEXT: pmulhuw %xmm9, %xmm5 ; SSE2-NEXT: paddw %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pand %xmm6, %xmm10 ; SSE2-NEXT: psraw $4, %xmm5 -; SSE2-NEXT: movdqa %xmm11, %xmm4 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm11 +; SSE2-NEXT: pandn %xmm5, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm5 ; SSE2-NEXT: pand %xmm7, %xmm5 -; SSE2-NEXT: psraw $2, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm6 -; SSE2-NEXT: pandn %xmm4, %xmm6 -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm10, %xmm4 -; SSE2-NEXT: psraw $1, %xmm6 +; SSE2-NEXT: psraw $2, %xmm11 +; SSE2-NEXT: movdqa %xmm7, 
%xmm10 +; SSE2-NEXT: pandn %xmm11, %xmm10 +; SSE2-NEXT: por %xmm5, %xmm10 ; SSE2-NEXT: movdqa %xmm10, %xmm5 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm12, %xmm5 -; SSE2-NEXT: movdqa %xmm12, %xmm8 -; SSE2-NEXT: pandn %xmm2, %xmm8 -; SSE2-NEXT: por %xmm5, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: psraw $1, %xmm10 +; SSE2-NEXT: movdqa %xmm8, %xmm11 +; SSE2-NEXT: pandn %xmm10, %xmm11 +; SSE2-NEXT: por %xmm5, %xmm11 +; SSE2-NEXT: pand %xmm4, %xmm11 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: por %xmm11, %xmm5 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: psraw $15, %xmm2 ; SSE2-NEXT: pmulhuw %xmm9, %xmm2 ; SSE2-NEXT: paddw %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: pand %xmm6, %xmm9 ; SSE2-NEXT: psraw $4, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm11 -; SSE2-NEXT: por %xmm4, %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm9, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm2 ; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: psraw $2, %xmm11 -; SSE2-NEXT: pandn %xmm11, %xmm7 +; SSE2-NEXT: psraw $2, %xmm6 +; SSE2-NEXT: pandn %xmm6, %xmm7 ; SSE2-NEXT: por %xmm2, %xmm7 ; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm10, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: psraw $1, %xmm7 -; SSE2-NEXT: pandn %xmm7, %xmm10 -; SSE2-NEXT: por %xmm2, %xmm10 -; SSE2-NEXT: pand %xmm12, %xmm10 -; SSE2-NEXT: pandn %xmm3, %xmm12 -; SSE2-NEXT: por %xmm10, %xmm12 -; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm12, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm2, %xmm8 +; SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16: diff --git a/llvm/test/CodeGen/X86/commute-fcmp.ll b/llvm/test/CodeGen/X86/commute-fcmp.ll --- a/llvm/test/CodeGen/X86/commute-fcmp.ll +++ b/llvm/test/CodeGen/X86/commute-fcmp.ll @@ -896,14 +896,14 @@ define <16 x i32> @commute_cmpps_ueq_zmm(ptr %a0, <16 x float> %a1) { ; SSE-LABEL: commute_cmpps_ueq_zmm: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm5 ; SSE-NEXT: movaps 32(%rdi), %xmm6 -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: cmpeqps %xmm0, %xmm4 -; SSE-NEXT: cmpunordps %xmm7, %xmm0 -; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: movaps 48(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: cmpeqps %xmm0, %xmm8 +; SSE-NEXT: cmpunordps %xmm4, %xmm0 +; SSE-NEXT: orps %xmm8, %xmm0 ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: cmpeqps %xmm1, %xmm4 ; SSE-NEXT: cmpunordps %xmm5, %xmm1 @@ -912,9 +912,9 @@ ; SSE-NEXT: cmpeqps %xmm2, %xmm4 ; SSE-NEXT: cmpunordps %xmm6, %xmm2 ; SSE-NEXT: orps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: cmpeqps %xmm3, %xmm4 -; SSE-NEXT: cmpunordps %xmm8, %xmm3 +; SSE-NEXT: cmpunordps %xmm7, %xmm3 ; SSE-NEXT: orps %xmm4, %xmm3 ; SSE-NEXT: retq ; @@ -938,14 +938,14 @@ define <16 x i32> @commute_cmpps_one_zmm(ptr %a0, <16 x float> %a1) { ; SSE-LABEL: commute_cmpps_one_zmm: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm5 ; SSE-NEXT: movaps 32(%rdi), %xmm6 -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: 
cmpneqps %xmm0, %xmm4 -; SSE-NEXT: cmpordps %xmm7, %xmm0 -; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: movaps 48(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: cmpneqps %xmm0, %xmm8 +; SSE-NEXT: cmpordps %xmm4, %xmm0 +; SSE-NEXT: andps %xmm8, %xmm0 ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: cmpneqps %xmm1, %xmm4 ; SSE-NEXT: cmpordps %xmm5, %xmm1 @@ -954,9 +954,9 @@ ; SSE-NEXT: cmpneqps %xmm2, %xmm4 ; SSE-NEXT: cmpordps %xmm6, %xmm2 ; SSE-NEXT: andps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: cmpneqps %xmm3, %xmm4 -; SSE-NEXT: cmpordps %xmm8, %xmm3 +; SSE-NEXT: cmpordps %xmm7, %xmm3 ; SSE-NEXT: andps %xmm4, %xmm3 ; SSE-NEXT: retq ; @@ -1156,14 +1156,14 @@ define <8 x i64> @commute_cmppd_ueq_zmmm(ptr %a0, <8 x double> %a1) { ; SSE-LABEL: commute_cmppd_ueq_zmmm: ; SSE: # %bb.0: -; SSE-NEXT: movapd (%rdi), %xmm7 +; SSE-NEXT: movapd (%rdi), %xmm4 ; SSE-NEXT: movapd 16(%rdi), %xmm5 ; SSE-NEXT: movapd 32(%rdi), %xmm6 -; SSE-NEXT: movapd 48(%rdi), %xmm8 -; SSE-NEXT: movapd %xmm7, %xmm4 -; SSE-NEXT: cmpeqpd %xmm0, %xmm4 -; SSE-NEXT: cmpunordpd %xmm7, %xmm0 -; SSE-NEXT: orpd %xmm4, %xmm0 +; SSE-NEXT: movapd 48(%rdi), %xmm7 +; SSE-NEXT: movapd %xmm4, %xmm8 +; SSE-NEXT: cmpeqpd %xmm0, %xmm8 +; SSE-NEXT: cmpunordpd %xmm4, %xmm0 +; SSE-NEXT: orpd %xmm8, %xmm0 ; SSE-NEXT: movapd %xmm5, %xmm4 ; SSE-NEXT: cmpeqpd %xmm1, %xmm4 ; SSE-NEXT: cmpunordpd %xmm5, %xmm1 @@ -1172,9 +1172,9 @@ ; SSE-NEXT: cmpeqpd %xmm2, %xmm4 ; SSE-NEXT: cmpunordpd %xmm6, %xmm2 ; SSE-NEXT: orpd %xmm4, %xmm2 -; SSE-NEXT: movapd %xmm8, %xmm4 +; SSE-NEXT: movapd %xmm7, %xmm4 ; SSE-NEXT: cmpeqpd %xmm3, %xmm4 -; SSE-NEXT: cmpunordpd %xmm8, %xmm3 +; SSE-NEXT: cmpunordpd %xmm7, %xmm3 ; SSE-NEXT: orpd %xmm4, %xmm3 ; SSE-NEXT: retq ; @@ -1198,14 +1198,14 @@ define <8 x i64> @commute_cmppd_one_zmmm(ptr %a0, <8 x double> %a1) { ; SSE-LABEL: commute_cmppd_one_zmmm: ; SSE: # %bb.0: -; SSE-NEXT: movapd (%rdi), %xmm7 +; SSE-NEXT: movapd (%rdi), %xmm4 ; SSE-NEXT: movapd 16(%rdi), %xmm5 ; SSE-NEXT: movapd 32(%rdi), %xmm6 -; SSE-NEXT: movapd 48(%rdi), %xmm8 -; SSE-NEXT: movapd %xmm7, %xmm4 -; SSE-NEXT: cmpneqpd %xmm0, %xmm4 -; SSE-NEXT: cmpordpd %xmm7, %xmm0 -; SSE-NEXT: andpd %xmm4, %xmm0 +; SSE-NEXT: movapd 48(%rdi), %xmm7 +; SSE-NEXT: movapd %xmm4, %xmm8 +; SSE-NEXT: cmpneqpd %xmm0, %xmm8 +; SSE-NEXT: cmpordpd %xmm4, %xmm0 +; SSE-NEXT: andpd %xmm8, %xmm0 ; SSE-NEXT: movapd %xmm5, %xmm4 ; SSE-NEXT: cmpneqpd %xmm1, %xmm4 ; SSE-NEXT: cmpordpd %xmm5, %xmm1 @@ -1214,9 +1214,9 @@ ; SSE-NEXT: cmpneqpd %xmm2, %xmm4 ; SSE-NEXT: cmpordpd %xmm6, %xmm2 ; SSE-NEXT: andpd %xmm4, %xmm2 -; SSE-NEXT: movapd %xmm8, %xmm4 +; SSE-NEXT: movapd %xmm7, %xmm4 ; SSE-NEXT: cmpneqpd %xmm3, %xmm4 -; SSE-NEXT: cmpordpd %xmm8, %xmm3 +; SSE-NEXT: cmpordpd %xmm7, %xmm3 ; SSE-NEXT: andpd %xmm4, %xmm3 ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/compact-unwind.ll b/llvm/test/CodeGen/X86/compact-unwind.ll --- a/llvm/test/CodeGen/X86/compact-unwind.ll +++ b/llvm/test/CodeGen/X86/compact-unwind.ll @@ -65,12 +65,12 @@ ; NOFP-CU: Entry at offset 0x20: ; NOFP-CU-NEXT: start: 0x1d _test1 -; NOFP-CU-NEXT: length: 0x42 +; NOFP-CU-NEXT: length: 0x44 ; NOFP-CU-NEXT: compact encoding: 0x02040c0a ; NOFP-FROM-ASM: Entry at offset 0x20: ; NOFP-FROM-ASM-NEXT: start: 0x1d _test1 -; NOFP-FROM-ASM-NEXT: length: 0x42 +; NOFP-FROM-ASM-NEXT: length: 0x44 ; NOFP-FROM-ASM-NEXT: compact encoding: 0x02040c0a define void @test1(ptr %image) optsize ssp uwtable { diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll 
b/llvm/test/CodeGen/X86/conditional-tailcall.ll --- a/llvm/test/CodeGen/X86/conditional-tailcall.ll +++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll @@ -362,14 +362,14 @@ ; CHECK64-NEXT: movq (%rdi), %rdi # encoding: [0x48,0x8b,0x3f] ; CHECK64-NEXT: movq -24(%rdi), %rax # encoding: [0x48,0x8b,0x47,0xe8] ; CHECK64-NEXT: leaq (%rdi,%rax), %rsi # encoding: [0x48,0x8d,0x34,0x07] -; CHECK64-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9] +; CHECK64-NEXT: xorl %r8d, %r8d # encoding: [0x45,0x31,0xc0] ; CHECK64-NEXT: pushq $2 # encoding: [0x6a,0x02] ; CHECK64-NEXT: .cfi_adjust_cfa_offset 8 -; CHECK64-NEXT: popq %r9 # encoding: [0x41,0x59] +; CHECK64-NEXT: popq %rcx # encoding: [0x59] ; CHECK64-NEXT: .cfi_adjust_cfa_offset -8 ; CHECK64-NEXT: pushq $1 # encoding: [0x6a,0x01] ; CHECK64-NEXT: .cfi_adjust_cfa_offset 8 -; CHECK64-NEXT: popq %r8 # encoding: [0x41,0x58] +; CHECK64-NEXT: popq %rdx # encoding: [0x5a] ; CHECK64-NEXT: .cfi_adjust_cfa_offset -8 ; CHECK64-NEXT: .LBB3_1: # %for.cond ; CHECK64-NEXT: # =>This Inner Loop Header: Depth=1 @@ -378,56 +378,56 @@ ; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_12-1, kind: FK_PCRel_1 ; CHECK64-NEXT: # %bb.2: # %for.body ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: cmpl $2, %ecx # encoding: [0x83,0xf9,0x02] +; CHECK64-NEXT: cmpl $2, %r8d # encoding: [0x41,0x83,0xf8,0x02] ; CHECK64-NEXT: je .LBB3_10 # encoding: [0x74,A] ; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 ; CHECK64-NEXT: # %bb.3: # %for.body ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: cmpl $1, %ecx # encoding: [0x83,0xf9,0x01] +; CHECK64-NEXT: cmpl $1, %r8d # encoding: [0x41,0x83,0xf8,0x01] ; CHECK64-NEXT: je .LBB3_8 # encoding: [0x74,A] ; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1 ; CHECK64-NEXT: # %bb.4: # %for.body ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: testl %ecx, %ecx # encoding: [0x85,0xc9] +; CHECK64-NEXT: testl %r8d, %r8d # encoding: [0x45,0x85,0xc0] ; CHECK64-NEXT: jne .LBB3_11 # encoding: [0x75,A] ; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 ; CHECK64-NEXT: # %bb.5: # %sw.bb ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: movzbl (%rdi), %edx # encoding: [0x0f,0xb6,0x17] -; CHECK64-NEXT: cmpl $43, %edx # encoding: [0x83,0xfa,0x2b] -; CHECK64-NEXT: movl %r8d, %ecx # encoding: [0x44,0x89,0xc1] +; CHECK64-NEXT: movzbl (%rdi), %r9d # encoding: [0x44,0x0f,0xb6,0x0f] +; CHECK64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b] +; CHECK64-NEXT: movl %edx, %r8d # encoding: [0x41,0x89,0xd0] ; CHECK64-NEXT: je .LBB3_11 # encoding: [0x74,A] ; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 ; CHECK64-NEXT: # %bb.6: # %sw.bb ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: cmpl $45, %edx # encoding: [0x83,0xfa,0x2d] -; CHECK64-NEXT: movl %r8d, %ecx # encoding: [0x44,0x89,0xc1] +; CHECK64-NEXT: cmpl $45, %r9d # encoding: [0x41,0x83,0xf9,0x2d] +; CHECK64-NEXT: movl %edx, %r8d # encoding: [0x41,0x89,0xd0] ; CHECK64-NEXT: je .LBB3_11 # encoding: [0x74,A] ; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 ; CHECK64-NEXT: # %bb.7: # %if.else ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: addl $-48, %edx # encoding: [0x83,0xc2,0xd0] -; CHECK64-NEXT: cmpl $10, %edx # encoding: [0x83,0xfa,0x0a] +; CHECK64-NEXT: addl $-48, %r9d # encoding: [0x41,0x83,0xc1,0xd0] +; CHECK64-NEXT: cmpl $10, %r9d # encoding: [0x41,0x83,0xf9,0x0a] ; CHECK64-NEXT: jmp 
.LBB3_9 # encoding: [0xeb,A] ; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1 ; CHECK64-NEXT: .LBB3_8: # %sw.bb14 ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: movzbl (%rdi), %ecx # encoding: [0x0f,0xb6,0x0f] -; CHECK64-NEXT: addl $-48, %ecx # encoding: [0x83,0xc1,0xd0] -; CHECK64-NEXT: cmpl $10, %ecx # encoding: [0x83,0xf9,0x0a] +; CHECK64-NEXT: movzbl (%rdi), %r8d # encoding: [0x44,0x0f,0xb6,0x07] +; CHECK64-NEXT: addl $-48, %r8d # encoding: [0x41,0x83,0xc0,0xd0] +; CHECK64-NEXT: cmpl $10, %r8d # encoding: [0x41,0x83,0xf8,0x0a] ; CHECK64-NEXT: .LBB3_9: # %if.else ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: movl %r9d, %ecx # encoding: [0x44,0x89,0xc9] +; CHECK64-NEXT: movl %ecx, %r8d # encoding: [0x41,0x89,0xc8] ; CHECK64-NEXT: jb .LBB3_11 # encoding: [0x72,A] ; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 ; CHECK64-NEXT: jmp .LBB3_13 # encoding: [0xeb,A] ; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_13-1, kind: FK_PCRel_1 ; CHECK64-NEXT: .LBB3_10: # %sw.bb22 ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: movzbl (%rdi), %ecx # encoding: [0x0f,0xb6,0x0f] -; CHECK64-NEXT: addl $-48, %ecx # encoding: [0x83,0xc1,0xd0] -; CHECK64-NEXT: cmpl $10, %ecx # encoding: [0x83,0xf9,0x0a] -; CHECK64-NEXT: movl %r9d, %ecx # encoding: [0x44,0x89,0xc9] +; CHECK64-NEXT: movzbl (%rdi), %r8d # encoding: [0x44,0x0f,0xb6,0x07] +; CHECK64-NEXT: addl $-48, %r8d # encoding: [0x41,0x83,0xc0,0xd0] +; CHECK64-NEXT: cmpl $10, %r8d # encoding: [0x41,0x83,0xf8,0x0a] +; CHECK64-NEXT: movl %ecx, %r8d # encoding: [0x41,0x89,0xc8] ; CHECK64-NEXT: jae _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_ # TAILCALL ; CHECK64-NEXT: # encoding: [0x73,A] ; CHECK64-NEXT: # fixup A - offset: 1, value: _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_-1, kind: FK_PCRel_1 @@ -438,7 +438,7 @@ ; CHECK64-NEXT: jmp .LBB3_1 # encoding: [0xeb,A] ; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_1-1, kind: FK_PCRel_1 ; CHECK64-NEXT: .LBB3_12: -; CHECK64-NEXT: cmpl $2, %ecx # encoding: [0x83,0xf9,0x02] +; CHECK64-NEXT: cmpl $2, %r8d # encoding: [0x41,0x83,0xf8,0x02] ; CHECK64-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; CHECK64-NEXT: # kill: def $al killed $al killed $eax ; CHECK64-NEXT: retq # encoding: [0xc3] @@ -450,34 +450,34 @@ ; WIN64-LABEL: pr31257: ; WIN64: # %bb.0: # %entry ; WIN64-NEXT: movq (%rcx), %rcx # encoding: [0x48,0x8b,0x09] -; WIN64-NEXT: movq -24(%rcx), %r8 # encoding: [0x4c,0x8b,0x41,0xe8] -; WIN64-NEXT: leaq (%rcx,%r8), %rdx # encoding: [0x4a,0x8d,0x14,0x01] -; WIN64-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; WIN64-NEXT: movq -24(%rcx), %rax # encoding: [0x48,0x8b,0x41,0xe8] +; WIN64-NEXT: leaq (%rcx,%rax), %rdx # encoding: [0x48,0x8d,0x14,0x01] +; WIN64-NEXT: xorl %r8d, %r8d # encoding: [0x45,0x31,0xc0] ; WIN64-NEXT: .LBB3_1: # %for.cond ; WIN64-NEXT: # =>This Inner Loop Header: Depth=1 -; WIN64-NEXT: testq %r8, %r8 # encoding: [0x4d,0x85,0xc0] +; WIN64-NEXT: testq %rax, %rax # encoding: [0x48,0x85,0xc0] ; WIN64-NEXT: je .LBB3_11 # encoding: [0x74,A] ; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 ; WIN64-NEXT: # %bb.2: # %for.body ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; WIN64-NEXT: cmpl $2, %eax # encoding: [0x83,0xf8,0x02] +; WIN64-NEXT: cmpl $2, %r8d # encoding: [0x41,0x83,0xf8,0x02] ; WIN64-NEXT: je .LBB3_9 # encoding: [0x74,A] ; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1 ; WIN64-NEXT: # %bb.3: # 
%for.body ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; WIN64-NEXT: cmpl $1, %eax # encoding: [0x83,0xf8,0x01] +; WIN64-NEXT: cmpl $1, %r8d # encoding: [0x41,0x83,0xf8,0x01] ; WIN64-NEXT: je .LBB3_7 # encoding: [0x74,A] ; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_7-1, kind: FK_PCRel_1 ; WIN64-NEXT: # %bb.4: # %for.body ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; WIN64-NEXT: testl %eax, %eax # encoding: [0x85,0xc0] +; WIN64-NEXT: testl %r8d, %r8d # encoding: [0x45,0x85,0xc0] ; WIN64-NEXT: jne .LBB3_10 # encoding: [0x75,A] ; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 ; WIN64-NEXT: # %bb.5: # %sw.bb ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; WIN64-NEXT: movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09] ; WIN64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b] -; WIN64-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00] +; WIN64-NEXT: movl $1, %r8d # encoding: [0x41,0xb8,0x01,0x00,0x00,0x00] ; WIN64-NEXT: je .LBB3_10 # encoding: [0x74,A] ; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 ; WIN64-NEXT: # %bb.6: # %sw.bb @@ -493,7 +493,7 @@ ; WIN64-NEXT: .LBB3_8: # %if.else ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; WIN64-NEXT: addl $-48, %r9d # encoding: [0x41,0x83,0xc1,0xd0] -; WIN64-NEXT: movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00] +; WIN64-NEXT: movl $2, %r8d # encoding: [0x41,0xb8,0x02,0x00,0x00,0x00] ; WIN64-NEXT: cmpl $10, %r9d # encoding: [0x41,0x83,0xf9,0x0a] ; WIN64-NEXT: jb .LBB3_10 # encoding: [0x72,A] ; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 @@ -503,7 +503,7 @@ ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; WIN64-NEXT: movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09] ; WIN64-NEXT: addl $-48, %r9d # encoding: [0x41,0x83,0xc1,0xd0] -; WIN64-NEXT: movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00] +; WIN64-NEXT: movl $2, %r8d # encoding: [0x41,0xb8,0x02,0x00,0x00,0x00] ; WIN64-NEXT: cmpl $10, %r9d # encoding: [0x41,0x83,0xf9,0x0a] ; WIN64-NEXT: jae _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_ # TAILCALL ; WIN64-NEXT: # encoding: [0x73,A] @@ -511,11 +511,11 @@ ; WIN64-NEXT: .LBB3_10: # %for.inc ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; WIN64-NEXT: incq %rcx # encoding: [0x48,0xff,0xc1] -; WIN64-NEXT: decq %r8 # encoding: [0x49,0xff,0xc8] +; WIN64-NEXT: decq %rax # encoding: [0x48,0xff,0xc8] ; WIN64-NEXT: jmp .LBB3_1 # encoding: [0xeb,A] ; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_1-1, kind: FK_PCRel_1 ; WIN64-NEXT: .LBB3_11: -; WIN64-NEXT: cmpl $2, %eax # encoding: [0x83,0xf8,0x02] +; WIN64-NEXT: cmpl $2, %r8d # encoding: [0x41,0x83,0xf8,0x02] ; WIN64-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; WIN64-NEXT: # kill: def $al killed $al killed $eax ; WIN64-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/copy-eflags.ll b/llvm/test/CodeGen/X86/copy-eflags.ll --- a/llvm/test/CodeGen/X86/copy-eflags.ll +++ b/llvm/test/CodeGen/X86/copy-eflags.ll @@ -247,25 +247,25 @@ ; ; X64-LABEL: PR37100: ; X64: # %bb.0: # %bb -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; X64-NEXT: movzbl %cl, %r11d +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movl {{[0-9]+}}(%rsp), %esi +; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB3_1: # %bb1 ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: movsbq %dil, %rax -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: cmpq %rax, %rsi -; X64-NEXT: setl %cl -; X64-NEXT: negl %ecx -; X64-NEXT: cmpq %rax, %rsi +; X64-NEXT: xorl %r11d, %r11d +; 
X64-NEXT: cmpq %rax, %r10 +; X64-NEXT: setl %r11b +; X64-NEXT: negl %r11d +; X64-NEXT: cmpq %rax, %r10 ; X64-NEXT: movzbl %al, %edi -; X64-NEXT: cmovgel %r11d, %edi +; X64-NEXT: cmovgel %ecx, %edi ; X64-NEXT: movb %dil, (%r8) -; X64-NEXT: cmovgel (%r9), %ecx -; X64-NEXT: movl %r10d, %eax +; X64-NEXT: cmovgel (%r9), %r11d +; X64-NEXT: movl %esi, %eax ; X64-NEXT: cltd -; X64-NEXT: idivl %ecx +; X64-NEXT: idivl %r11d ; X64-NEXT: jmp .LBB3_1 bb: br label %bb1 diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll --- a/llvm/test/CodeGen/X86/ctpop-combine.ll +++ b/llvm/test/CodeGen/X86/ctpop-combine.ll @@ -162,27 +162,27 @@ define i1 @ctpop_trunc_non_power2(i255 %x) nounwind { ; CHECK-LABEL: ctpop_trunc_non_power2: ; CHECK: # %bb.0: -; CHECK-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: movq %rcx, %r9 -; CHECK-NEXT: andq %r8, %r9 -; CHECK-NEXT: movq %rdi, %r10 -; CHECK-NEXT: addq $-1, %r10 -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: adcq $-1, %rax +; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: movq %rcx, %r8 +; CHECK-NEXT: andq %rax, %r8 +; CHECK-NEXT: movq %rdi, %r9 +; CHECK-NEXT: addq $-1, %r9 +; CHECK-NEXT: movq %rsi, %r10 +; CHECK-NEXT: adcq $-1, %r10 ; CHECK-NEXT: movq %rdx, %r11 ; CHECK-NEXT: adcq $-1, %r11 -; CHECK-NEXT: adcq %r8, %rcx -; CHECK-NEXT: andq %rdi, %r10 +; CHECK-NEXT: adcq %rax, %rcx +; CHECK-NEXT: andq %rdi, %r9 ; CHECK-NEXT: andq %rdx, %r11 -; CHECK-NEXT: orq %r10, %r11 -; CHECK-NEXT: andq %r9, %rcx -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: orq %rcx, %rax -; CHECK-NEXT: orq %r11, %rax +; CHECK-NEXT: orq %r9, %r11 +; CHECK-NEXT: andq %r8, %rcx +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: orq %rcx, %r10 +; CHECK-NEXT: orq %r11, %r10 ; CHECK-NEXT: sete %cl ; CHECK-NEXT: orq %rdx, %rdi -; CHECK-NEXT: orq %rsi, %r9 -; CHECK-NEXT: orq %rdi, %r9 +; CHECK-NEXT: orq %rsi, %r8 +; CHECK-NEXT: orq %rdi, %r8 ; CHECK-NEXT: setne %al ; CHECK-NEXT: andb %cl, %al ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll --- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll +++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll @@ -24,38 +24,38 @@ ; CHECK-LABEL: _Z1nv: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq k@GOTPCREL(%rip), %rax -; CHECK-NEXT: movl 4(%rax), %r11d +; CHECK-NEXT: movl 4(%rax), %edx ; CHECK-NEXT: movq c@GOTPCREL(%rip), %rax -; CHECK-NEXT: movswl (%rax), %r10d -; CHECK-NEXT: movq b@GOTPCREL(%rip), %r8 -; CHECK-NEXT: movswl (%r8), %r9d +; CHECK-NEXT: movswl (%rax), %ecx +; CHECK-NEXT: movq b@GOTPCREL(%rip), %rax +; CHECK-NEXT: movswl (%rax), %edi ; CHECK-NEXT: movq a@GOTPCREL(%rip), %rsi ; CHECK-NEXT: movl (%rsi), %esi -; CHECK-NEXT: movq l@GOTPCREL(%rip), %rax -; CHECK-NEXT: movl (%rax), %edi -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shll $7, %eax -; CHECK-NEXT: sarl $7, %eax -; CHECK-NEXT: negl %eax +; CHECK-NEXT: movq l@GOTPCREL(%rip), %r8 +; CHECK-NEXT: movl (%r8), %r8d +; CHECK-NEXT: movl %r8d, %r9d +; CHECK-NEXT: shll $7, %r9d +; CHECK-NEXT: sarl $7, %r9d +; CHECK-NEXT: negl %r9d ; CHECK-NEXT: testl %esi, %esi -; CHECK-NEXT: cmovel %esi, %eax -; CHECK-NEXT: movzwl %r11w, %ecx -; CHECK-NEXT: leal (%r10,%rcx,2), %ecx -; CHECK-NEXT: addl %r9d, %ecx -; CHECK-NEXT: cmpl %eax, %ecx -; CHECK-NEXT: sete %al -; CHECK-NEXT: testl $33554431, %edi # imm = 0x1FFFFFF -; CHECK-NEXT: sete %dl -; CHECK-NEXT: orb %al, %dl -; CHECK-NEXT: movzbl %dl, %eax -; CHECK-NEXT: movq 
e@GOTPCREL(%rip), %rdx -; CHECK-NEXT: movw %ax, (%rdx) +; CHECK-NEXT: cmovel %esi, %r9d +; CHECK-NEXT: movzwl %dx, %r10d +; CHECK-NEXT: leal (%rcx,%r10,2), %ecx +; CHECK-NEXT: addl %edi, %ecx +; CHECK-NEXT: cmpl %r9d, %ecx +; CHECK-NEXT: sete %dil +; CHECK-NEXT: testl $33554431, %r8d # imm = 0x1FFFFFF +; CHECK-NEXT: sete %r8b +; CHECK-NEXT: orb %dil, %r8b +; CHECK-NEXT: movzbl %r8b, %edi +; CHECK-NEXT: movq e@GOTPCREL(%rip), %r8 +; CHECK-NEXT: movw %di, (%r8) ; CHECK-NEXT: notl %ecx ; CHECK-NEXT: shrl $31, %ecx -; CHECK-NEXT: addl %r11d, %ecx +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: sarl %cl, %esi -; CHECK-NEXT: movw %si, (%r8) +; CHECK-NEXT: movw %si, (%rax) ; CHECK-NEXT: retq entry: %bf.load = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.m, %struct.m* @k, i64 0, i32 0, i32 1) to i32*), align 4 @@ -115,115 +115,115 @@ ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax -; CHECK-NEXT: movl (%rax), %edx -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: andl $511, %eax # imm = 0x1FF -; CHECK-NEXT: leaq 1(%rax), %rsi +; CHECK-NEXT: movl (%rax), %ebx +; CHECK-NEXT: movl %ebx, %r9d +; CHECK-NEXT: andl $511, %r9d # imm = 0x1FF +; CHECK-NEXT: leaq 1(%r9), %rax ; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx -; CHECK-NEXT: movl %esi, (%rcx) +; CHECK-NEXT: movl %eax, (%rcx) ; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx ; CHECK-NEXT: movl (%rcx), %ecx ; CHECK-NEXT: testl %ecx, %ecx ; CHECK-NEXT: je .LBB1_18 ; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph -; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rdi -; CHECK-NEXT: movq (%rdi), %r12 -; CHECK-NEXT: movl %ecx, %edi -; CHECK-NEXT: notl %edi -; CHECK-NEXT: leaq 8(,%rdi,8), %r14 -; CHECK-NEXT: imulq %rsi, %r14 -; CHECK-NEXT: addq %r12, %r14 -; CHECK-NEXT: movq x2@GOTPCREL(%rip), %r15 -; CHECK-NEXT: movl (%r15), %ebx -; CHECK-NEXT: leal 8(,%rax,8), %eax +; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rdx +; CHECK-NEXT: movq (%rdx), %rsi +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: notl %edx +; CHECK-NEXT: leaq 8(,%rdx,8), %rdi +; CHECK-NEXT: imulq %rax, %rdi +; CHECK-NEXT: addq %rsi, %rdi +; CHECK-NEXT: movq x2@GOTPCREL(%rip), %r8 +; CHECK-NEXT: movl (%r8), %edx +; CHECK-NEXT: leal 8(,%r9,8), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: leaq 8(%r12), %rax +; CHECK-NEXT: leaq 8(%rsi), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: leaq 32(%r12), %rax -; CHECK-NEXT: andl $511, %edx # imm = 0x1FF -; CHECK-NEXT: leaq 8(,%rdx,8), %r13 -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: movq x0@GOTPCREL(%rip), %rdx -; CHECK-NEXT: movq %r12, %rsi +; CHECK-NEXT: leaq 32(%rsi), %r11 +; CHECK-NEXT: andl $511, %ebx # imm = 0x1FF +; CHECK-NEXT: leaq 8(,%rbx,8), %rbx +; CHECK-NEXT: xorl %r14d, %r14d +; CHECK-NEXT: movq x0@GOTPCREL(%rip), %r15 +; CHECK-NEXT: movq %rsi, %r12 ; CHECK-NEXT: jmp .LBB1_2 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_15: # %for.cond1.for.inc3_crit_edge ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: movl %ebx, (%r15) +; CHECK-NEXT: movl %edx, (%r8) ; CHECK-NEXT: .LBB1_16: # %for.inc3 ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: addq %r13, %rsi -; CHECK-NEXT: incq %rdi -; CHECK-NEXT: addq %r13, %rax +; CHECK-NEXT: addq %rbx, %r12 +; CHECK-NEXT: incq %r14 +; CHECK-NEXT: addq %rbx, %r11 ; CHECK-NEXT: incl %ecx ; CHECK-NEXT: je .LBB1_17 ; CHECK-NEXT: .LBB1_2: # %for.cond1thread-pre-split ; CHECK-NEXT: # =>This Loop 
Header: Depth=1 ; CHECK-NEXT: # Child Loop BB1_12 Depth 2 ; CHECK-NEXT: # Child Loop BB1_14 Depth 2 -; CHECK-NEXT: testl %ebx, %ebx +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jns .LBB1_16 ; CHECK-NEXT: # %bb.3: # %for.body2.preheader ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: movslq %ebx, %r9 -; CHECK-NEXT: testq %r9, %r9 +; CHECK-NEXT: movslq %edx, %r13 +; CHECK-NEXT: testq %r13, %r13 ; CHECK-NEXT: movq $-1, %rbp -; CHECK-NEXT: cmovnsq %r9, %rbp -; CHECK-NEXT: subq %r9, %rbp +; CHECK-NEXT: cmovnsq %r13, %rbp +; CHECK-NEXT: subq %r13, %rbp ; CHECK-NEXT: incq %rbp ; CHECK-NEXT: cmpq $4, %rbp ; CHECK-NEXT: jb .LBB1_14 ; CHECK-NEXT: # %bb.4: # %min.iters.checked ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: movq %rbp, %r8 -; CHECK-NEXT: andq $-4, %r8 +; CHECK-NEXT: movq %rbp, %rdx +; CHECK-NEXT: andq $-4, %rdx ; CHECK-NEXT: je .LBB1_14 ; CHECK-NEXT: # %bb.5: # %vector.memcheck ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; CHECK-NEXT: imulq %rdi, %r11 -; CHECK-NEXT: leaq (%r12,%r11), %rbx -; CHECK-NEXT: leaq (%rbx,%r9,8), %rbx -; CHECK-NEXT: testq %r9, %r9 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: imulq %r14, %rax +; CHECK-NEXT: leaq (%rsi,%rax), %r10 +; CHECK-NEXT: leaq (%r10,%r13,8), %r9 +; CHECK-NEXT: testq %r13, %r13 ; CHECK-NEXT: movq $-1, %r10 -; CHECK-NEXT: cmovnsq %r9, %r10 -; CHECK-NEXT: cmpq %rdx, %rbx +; CHECK-NEXT: cmovnsq %r13, %r10 +; CHECK-NEXT: cmpq %r15, %r9 ; CHECK-NEXT: jae .LBB1_7 ; CHECK-NEXT: # %bb.6: # %vector.memcheck ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; CHECK-NEXT: leaq (%r11,%r10,8), %rbx -; CHECK-NEXT: cmpq %rdx, %rbx +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; CHECK-NEXT: leaq (%rax,%r10,8), %rax +; CHECK-NEXT: cmpq %r15, %rax ; CHECK-NEXT: ja .LBB1_14 ; CHECK-NEXT: .LBB1_7: # %vector.body.preheader ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: leaq -4(%r8), %rbx -; CHECK-NEXT: movq %rbx, %r11 -; CHECK-NEXT: shrq $2, %r11 -; CHECK-NEXT: btl $2, %ebx +; CHECK-NEXT: leaq -4(%rdx), %r9 +; CHECK-NEXT: movq %r9, %rax +; CHECK-NEXT: shrq $2, %rax +; CHECK-NEXT: btl $2, %r9d ; CHECK-NEXT: jb .LBB1_8 ; CHECK-NEXT: # %bb.9: # %vector.body.prol.preheader ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; CHECK-NEXT: movdqu %xmm0, (%rsi,%r9,8) -; CHECK-NEXT: movdqu %xmm0, 16(%rsi,%r9,8) +; CHECK-NEXT: movdqu %xmm0, (%r12,%r13,8) +; CHECK-NEXT: movdqu %xmm0, 16(%r12,%r13,8) ; CHECK-NEXT: movl $4, %r10d -; CHECK-NEXT: testq %r11, %r11 +; CHECK-NEXT: testq %rax, %rax ; CHECK-NEXT: jne .LBB1_11 ; CHECK-NEXT: jmp .LBB1_13 ; CHECK-NEXT: .LBB1_8: # in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: xorl %r10d, %r10d -; CHECK-NEXT: testq %r11, %r11 +; CHECK-NEXT: testq %rax, %rax ; CHECK-NEXT: je .LBB1_13 ; CHECK-NEXT: .LBB1_11: # %vector.body.preheader.new ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; CHECK-NEXT: movq %r10, %rbx -; CHECK-NEXT: subq %r8, %rbx -; CHECK-NEXT: addq %r9, %r10 -; CHECK-NEXT: leaq (%rax,%r10,8), %r10 +; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: subq %rdx, %rax +; CHECK-NEXT: addq %r13, %r10 +; CHECK-NEXT: leaq (%r11,%r10,8), %r10 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_12: # 
%vector.body ; CHECK-NEXT: # Parent Loop BB1_2 Depth=1 @@ -233,28 +233,28 @@ ; CHECK-NEXT: movdqu %xmm0, (%r10) ; CHECK-NEXT: movdqu %xmm0, 16(%r10) ; CHECK-NEXT: addq $64, %r10 -; CHECK-NEXT: addq $8, %rbx +; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: jne .LBB1_12 ; CHECK-NEXT: .LBB1_13: # %middle.block ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: addq %r8, %r9 -; CHECK-NEXT: cmpq %r8, %rbp -; CHECK-NEXT: movq %r9, %rbx +; CHECK-NEXT: addq %rdx, %r13 +; CHECK-NEXT: cmpq %rdx, %rbp +; CHECK-NEXT: movq %r13, %rdx ; CHECK-NEXT: je .LBB1_15 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_14: # %for.body2 ; CHECK-NEXT: # Parent Loop BB1_2 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movq (%rdx), %rbp -; CHECK-NEXT: movq %rbp, (%rsi,%r9,8) -; CHECK-NEXT: leaq 1(%r9), %rbx -; CHECK-NEXT: cmpq $-1, %r9 -; CHECK-NEXT: movq %rbx, %r9 +; CHECK-NEXT: movq (%r15), %rax +; CHECK-NEXT: movq %rax, (%r12,%r13,8) +; CHECK-NEXT: leaq 1(%r13), %rdx +; CHECK-NEXT: cmpq $-1, %r13 +; CHECK-NEXT: movq %rdx, %r13 ; CHECK-NEXT: jl .LBB1_14 ; CHECK-NEXT: jmp .LBB1_15 ; CHECK-NEXT: .LBB1_17: # %for.cond.for.end5_crit_edge ; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rax -; CHECK-NEXT: movq %r14, (%rax) +; CHECK-NEXT: movq %rdi, (%rax) ; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rax ; CHECK-NEXT: movl $0, (%rax) ; CHECK-NEXT: .LBB1_18: # %for.end5 diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -114,20 +114,20 @@ ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq %r8, %rax +; X64-NEXT: addq %r8, %rdx +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: adcq $0, %rax -; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: adcq %r8, %rax +; X64-NEXT: addq %rdx, %r8 +; X64-NEXT: adcq %rsi, %rax ; X64-NEXT: imulq %rcx, %rcx ; X64-NEXT: addq %rax, %rcx -; X64-NEXT: shrdq $32, %rcx, %rsi +; X64-NEXT: shrdq $32, %rcx, %r8 ; X64-NEXT: shrq $32, %rcx -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: movq %rcx, %rdx ; X64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -181,18 +181,18 @@ ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r8, %r14 -; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: movq %r8, %rbx +; X64-NEXT: movq %rcx, %r14 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: movq %rdi, %r13 ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rdx, 8(%r14) -; X64-NEXT: movq %rax, (%r14) -; X64-NEXT: imulq %rax, %rbx +; X64-NEXT: movq %rdx, 8(%rbx) +; X64-NEXT: movq %rax, (%rbx) +; X64-NEXT: imulq %rax, %r14 ; X64-NEXT: mulq %r15 -; X64-NEXT: addq %rbx, %rdx +; X64-NEXT: addq %r14, %rdx ; X64-NEXT: imulq %r15, %rcx ; X64-NEXT: addq %rdx, %rcx ; X64-NEXT: subq %rax, %r13 @@ -343,40 +343,40 @@ ; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r8d +; X64-NEXT: movzbl %al, %edi ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: 
idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r9d +; X64-NEXT: movzbl %al, %esi ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r10d +; X64-NEXT: movzbl %al, %r8d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r11d +; X64-NEXT: movzbl %al, %r9d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r14d +; X64-NEXT: movzbl %al, %r10d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r15d +; X64-NEXT: movzbl %al, %r11d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r12d +; X64-NEXT: movzbl %al, %ebx ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r13d +; X64-NEXT: movzbl %al, %ebp ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %edi +; X64-NEXT: movzbl %al, %r14d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %esi +; X64-NEXT: movzbl %al, %r15d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %ebx +; X64-NEXT: movzbl %al, %r12d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %ebp +; X64-NEXT: movzbl %al, %r13d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) ; X64-NEXT: movzbl %al, %edx @@ -385,26 +385,26 @@ ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movd %r8d, %xmm3 -; X64-NEXT: movd %r9d, %xmm4 -; X64-NEXT: movd %r10d, %xmm5 -; X64-NEXT: movd %r11d, %xmm6 +; X64-NEXT: movd %edi, %xmm3 +; X64-NEXT: movd %esi, %xmm4 +; X64-NEXT: movd %r8d, %xmm5 +; X64-NEXT: movd %r9d, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movd %r14d, %xmm2 +; X64-NEXT: movd %r10d, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; X64-NEXT: movd %r15d, %xmm4 +; X64-NEXT: movd %r11d, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X64-NEXT: movd %r12d, %xmm3 +; X64-NEXT: movd %ebx, %xmm3 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; X64-NEXT: movd %r13d, %xmm6 +; X64-NEXT: movd %ebp, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-NEXT: movd %edi, %xmm4 +; X64-NEXT: movd %r14d, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NEXT: movd %esi, %xmm2 +; X64-NEXT: movd %r15d, %xmm2 ; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; X64-NEXT: movd %ebx, %xmm5 +; X64-NEXT: movd %r12d, %xmm5 ; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; X64-NEXT: movd %ebp, %xmm6 +; X64-NEXT: movd %r13d, 
%xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; X64-NEXT: movd %edx, %xmm2 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -181,18 +181,18 @@ ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r8, %r14 -; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: movq %r8, %rbx +; X64-NEXT: movq %rcx, %r14 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: movq %rdi, %r13 ; X64-NEXT: callq __udivti3@PLT ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rdx, 8(%r14) -; X64-NEXT: movq %rax, (%r14) -; X64-NEXT: imulq %rax, %rbx +; X64-NEXT: movq %rdx, 8(%rbx) +; X64-NEXT: movq %rax, (%rbx) +; X64-NEXT: imulq %rax, %r14 ; X64-NEXT: mulq %r15 -; X64-NEXT: addq %rbx, %rdx +; X64-NEXT: addq %r14, %rdx ; X64-NEXT: imulq %r15, %rcx ; X64-NEXT: addq %rdx, %rcx ; X64-NEXT: subq %rax, %r13 @@ -343,40 +343,40 @@ ; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r8d +; X64-NEXT: movzbl %al, %edi ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r9d +; X64-NEXT: movzbl %al, %esi ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r10d +; X64-NEXT: movzbl %al, %r8d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r11d +; X64-NEXT: movzbl %al, %r9d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r14d +; X64-NEXT: movzbl %al, %r10d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r15d +; X64-NEXT: movzbl %al, %r11d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r12d +; X64-NEXT: movzbl %al, %ebx ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r13d +; X64-NEXT: movzbl %al, %ebp ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %edi +; X64-NEXT: movzbl %al, %r14d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %esi +; X64-NEXT: movzbl %al, %r15d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %ebx +; X64-NEXT: movzbl %al, %r12d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %ebp +; X64-NEXT: movzbl %al, %r13d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) ; X64-NEXT: movzbl %al, %edx @@ -385,26 +385,26 @@ ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movd %r8d, %xmm3 -; X64-NEXT: movd %r9d, %xmm4 -; X64-NEXT: movd %r10d, %xmm5 -; X64-NEXT: movd %r11d, %xmm6 +; X64-NEXT: movd %edi, %xmm3 +; X64-NEXT: movd %esi, %xmm4 +; X64-NEXT: movd %r8d, %xmm5 +; X64-NEXT: movd %r9d, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movd %r14d, %xmm2 +; X64-NEXT: movd %r10d, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; X64-NEXT: movd %r15d, %xmm4 +; X64-NEXT: movd %r11d, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X64-NEXT: movd %r12d, %xmm3 +; X64-NEXT: movd %ebx, %xmm3 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; X64-NEXT: movd %r13d, %xmm6 +; X64-NEXT: movd %ebp, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-NEXT: movd %edi, %xmm4 +; X64-NEXT: movd %r14d, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NEXT: movd %esi, %xmm2 +; X64-NEXT: movd %r15d, %xmm2 ; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; X64-NEXT: movd %ebx, %xmm5 +; X64-NEXT: movd %r12d, %xmm5 ; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; X64-NEXT: movd %ebp, %xmm6 +; X64-NEXT: movd %r13d, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; X64-NEXT: movd %edx, %xmm2 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -479,21 +479,21 @@ ; WIN64: # %bb.0: # %entry ; WIN64-NEXT: movq %rdx, %r8 ; WIN64-NEXT: movq %rcx, %r9 -; WIN64-NEXT: addq %rdx, %rcx -; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: addq %rdx, %r9 +; WIN64-NEXT: adcq $0, %r9 ; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB -; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: movq %r9, %rax ; WIN64-NEXT: mulq %r10 ; WIN64-NEXT: shrq %rdx ; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax -; WIN64-NEXT: subq %rax, %rcx -; WIN64-NEXT: subq %rcx, %r9 +; WIN64-NEXT: subq %rax, %r9 +; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 -; WIN64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA -; WIN64-NEXT: imulq %r9, %rcx -; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA +; WIN64-NEXT: imulq %rcx, %r9 +; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 -; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq @@ -529,21 +529,21 @@ ; WIN64: # %bb.0: # %entry ; WIN64-NEXT: movq %rdx, %r8 ; WIN64-NEXT: movq %rcx, %r9 -; WIN64-NEXT: addq %rdx, %rcx -; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: addq %rdx, %r9 +; WIN64-NEXT: adcq $0, %r9 ; WIN64-NEXT: movabsq $-3689348814741910323, %r10 # imm = 0xCCCCCCCCCCCCCCCD -; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: movq %r9, %rax ; WIN64-NEXT: mulq %r10 ; WIN64-NEXT: shrq $2, %rdx ; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax -; WIN64-NEXT: subq %rax, %rcx -; 
WIN64-NEXT: subq %rcx, %r9 +; WIN64-NEXT: subq %rax, %r9 +; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 -; WIN64-NEXT: movabsq $-3689348814741910324, %rcx # imm = 0xCCCCCCCCCCCCCCCC -; WIN64-NEXT: imulq %r9, %rcx -; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: movabsq $-3689348814741910324, %r9 # imm = 0xCCCCCCCCCCCCCCCC +; WIN64-NEXT: imulq %rcx, %r9 +; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 -; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq @@ -567,37 +567,37 @@ ; X86-64-NEXT: subq %rax, %rcx ; X86-64-NEXT: subq %rcx, %rdi ; X86-64-NEXT: sbbq $0, %rsi -; X86-64-NEXT: movabsq $-1229782938247303442, %r8 # imm = 0xEEEEEEEEEEEEEEEE -; X86-64-NEXT: imulq %rdi, %r8 -; X86-64-NEXT: movabsq $-1229782938247303441, %rcx # imm = 0xEEEEEEEEEEEEEEEF +; X86-64-NEXT: movabsq $-1229782938247303442, %rcx # imm = 0xEEEEEEEEEEEEEEEE +; X86-64-NEXT: imulq %rdi, %rcx +; X86-64-NEXT: movabsq $-1229782938247303441, %r8 # imm = 0xEEEEEEEEEEEEEEEF ; X86-64-NEXT: movq %rdi, %rax -; X86-64-NEXT: mulq %rcx -; X86-64-NEXT: addq %r8, %rdx -; X86-64-NEXT: imulq %rsi, %rcx +; X86-64-NEXT: mulq %r8 ; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %rsi, %r8 +; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_i128_15: ; WIN64: # %bb.0: # %entry ; WIN64-NEXT: movq %rdx, %r8 ; WIN64-NEXT: movq %rcx, %r9 -; WIN64-NEXT: addq %rdx, %rcx -; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: addq %rdx, %r9 +; WIN64-NEXT: adcq $0, %r9 ; WIN64-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 -; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: movq %r9, %rax ; WIN64-NEXT: mulq %rdx ; WIN64-NEXT: shrq $3, %rdx ; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax ; WIN64-NEXT: leaq (%rax,%rax,2), %rax -; WIN64-NEXT: subq %rax, %rcx -; WIN64-NEXT: subq %rcx, %r9 +; WIN64-NEXT: subq %rax, %r9 +; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 -; WIN64-NEXT: movabsq $-1229782938247303442, %rcx # imm = 0xEEEEEEEEEEEEEEEE -; WIN64-NEXT: imulq %r9, %rcx +; WIN64-NEXT: movabsq $-1229782938247303442, %r9 # imm = 0xEEEEEEEEEEEEEEEE +; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movabsq $-1229782938247303441, %r10 # imm = 0xEEEEEEEEEEEEEEEF -; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 -; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq @@ -635,23 +635,23 @@ ; WIN64: # %bb.0: # %entry ; WIN64-NEXT: movq %rdx, %r8 ; WIN64-NEXT: movq %rcx, %r9 -; WIN64-NEXT: addq %rdx, %rcx -; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: addq %rdx, %r9 +; WIN64-NEXT: adcq $0, %r9 ; WIN64-NEXT: movabsq $-1085102592571150095, %r10 # imm = 0xF0F0F0F0F0F0F0F1 -; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: movq %r9, %rax ; WIN64-NEXT: mulq %r10 ; WIN64-NEXT: movq %rdx, %rax ; WIN64-NEXT: andq $-16, %rax ; WIN64-NEXT: shrq $4, %rdx ; WIN64-NEXT: addq %rax, %rdx -; WIN64-NEXT: subq %rdx, %rcx -; WIN64-NEXT: subq %rcx, %r9 +; WIN64-NEXT: subq %rdx, %r9 +; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 -; WIN64-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 -; WIN64-NEXT: imulq %r9, %rcx -; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: movabsq $-1085102592571150096, %r9 # imm = 0xF0F0F0F0F0F0F0F0 +; WIN64-NEXT: imulq %rcx, %r9 +; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 -; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx ; 
WIN64-NEXT: retq @@ -677,14 +677,14 @@ ; X86-64-NEXT: adcq %rdx, %rax ; X86-64-NEXT: subq %rax, %rdi ; X86-64-NEXT: sbbq $0, %rsi -; X86-64-NEXT: movabsq $-72340172838076674, %r8 # imm = 0xFEFEFEFEFEFEFEFE -; X86-64-NEXT: imulq %rdi, %r8 -; X86-64-NEXT: movabsq $-72340172838076673, %rcx # imm = 0xFEFEFEFEFEFEFEFF +; X86-64-NEXT: movabsq $-72340172838076674, %rcx # imm = 0xFEFEFEFEFEFEFEFE +; X86-64-NEXT: imulq %rdi, %rcx +; X86-64-NEXT: movabsq $-72340172838076673, %r8 # imm = 0xFEFEFEFEFEFEFEFF ; X86-64-NEXT: movq %rdi, %rax -; X86-64-NEXT: mulq %rcx -; X86-64-NEXT: addq %r8, %rdx -; X86-64-NEXT: imulq %rsi, %rcx +; X86-64-NEXT: mulq %r8 ; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %rsi, %r8 +; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_i128_255: @@ -747,23 +747,23 @@ ; WIN64: # %bb.0: # %entry ; WIN64-NEXT: movq %rdx, %r8 ; WIN64-NEXT: movq %rcx, %r9 -; WIN64-NEXT: addq %rdx, %rcx -; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: addq %rdx, %r9 +; WIN64-NEXT: adcq $0, %r9 ; WIN64-NEXT: movabsq $-71777214294589695, %r10 # imm = 0xFF00FF00FF00FF01 -; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: movq %r9, %rax ; WIN64-NEXT: mulq %r10 ; WIN64-NEXT: movq %rdx, %rax ; WIN64-NEXT: andq $-256, %rax ; WIN64-NEXT: shrq $8, %rdx ; WIN64-NEXT: addq %rax, %rdx -; WIN64-NEXT: subq %rdx, %rcx -; WIN64-NEXT: subq %rcx, %r9 +; WIN64-NEXT: subq %rdx, %r9 +; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 -; WIN64-NEXT: movabsq $-71777214294589696, %rcx # imm = 0xFF00FF00FF00FF00 -; WIN64-NEXT: imulq %r9, %rcx -; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: movabsq $-71777214294589696, %r9 # imm = 0xFF00FF00FF00FF00 +; WIN64-NEXT: imulq %rcx, %r9 +; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 -; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq @@ -789,14 +789,14 @@ ; X86-64-NEXT: adcq %rdx, %rax ; X86-64-NEXT: subq %rax, %rdi ; X86-64-NEXT: sbbq $0, %rsi -; X86-64-NEXT: movabsq $-281479271743490, %r8 # imm = 0xFFFEFFFEFFFEFFFE -; X86-64-NEXT: imulq %rdi, %r8 -; X86-64-NEXT: movabsq $-281479271743489, %rcx # imm = 0xFFFEFFFEFFFEFFFF +; X86-64-NEXT: movabsq $-281479271743490, %rcx # imm = 0xFFFEFFFEFFFEFFFE +; X86-64-NEXT: imulq %rdi, %rcx +; X86-64-NEXT: movabsq $-281479271743489, %r8 # imm = 0xFFFEFFFEFFFEFFFF ; X86-64-NEXT: movq %rdi, %rax -; X86-64-NEXT: mulq %rcx -; X86-64-NEXT: addq %r8, %rdx -; X86-64-NEXT: imulq %rsi, %rcx +; X86-64-NEXT: mulq %r8 ; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %rsi, %r8 +; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_i128_65535: @@ -859,23 +859,23 @@ ; WIN64: # %bb.0: # %entry ; WIN64-NEXT: movq %rdx, %r8 ; WIN64-NEXT: movq %rcx, %r9 -; WIN64-NEXT: addq %rdx, %rcx -; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: addq %rdx, %r9 +; WIN64-NEXT: adcq $0, %r9 ; WIN64-NEXT: movabsq $-281470681808895, %r10 # imm = 0xFFFF0000FFFF0001 -; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: movq %r9, %rax ; WIN64-NEXT: mulq %r10 ; WIN64-NEXT: movq %rdx, %rax ; WIN64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000 ; WIN64-NEXT: shrq $16, %rdx ; WIN64-NEXT: addq %rax, %rdx -; WIN64-NEXT: subq %rdx, %rcx -; WIN64-NEXT: subq %rcx, %r9 +; WIN64-NEXT: subq %rdx, %r9 +; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 -; WIN64-NEXT: movabsq $-281470681808896, %rcx # imm = 0xFFFF0000FFFF0000 -; WIN64-NEXT: imulq %r9, %rcx -; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: movabsq $-281470681808896, %r9 # imm = 0xFFFF0000FFFF0000 +; WIN64-NEXT: imulq 
%rcx, %r9 +; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 -; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll --- a/llvm/test/CodeGen/X86/extract-bits.ll +++ b/llvm/test/CodeGen/X86/extract-bits.ll @@ -1921,58 +1921,58 @@ ; ; X64-NOBMI-LABEL: bextr64_32_a1_trunc_extrause: ; X64-NOBMI: # %bb.0: -; X64-NOBMI-NEXT: pushq %rbp +; X64-NOBMI-NEXT: pushq %r14 ; X64-NOBMI-NEXT: pushq %rbx ; X64-NOBMI-NEXT: pushq %rax -; X64-NOBMI-NEXT: movl %edx, %ebp +; X64-NOBMI-NEXT: movl %edx, %ebx ; X64-NOBMI-NEXT: movq %rsi, %rcx -; X64-NOBMI-NEXT: movq %rdi, %rbx +; X64-NOBMI-NEXT: movq %rdi, %r14 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-NOBMI-NEXT: shrq %cl, %rbx -; X64-NOBMI-NEXT: movl %ebx, %edi +; X64-NOBMI-NEXT: shrq %cl, %r14 +; X64-NOBMI-NEXT: movl %r14d, %edi ; X64-NOBMI-NEXT: callq use32@PLT ; X64-NOBMI-NEXT: movl $1, %eax -; X64-NOBMI-NEXT: movl %ebp, %ecx +; X64-NOBMI-NEXT: movl %ebx, %ecx ; X64-NOBMI-NEXT: shll %cl, %eax ; X64-NOBMI-NEXT: decl %eax -; X64-NOBMI-NEXT: andl %ebx, %eax +; X64-NOBMI-NEXT: andl %r14d, %eax ; X64-NOBMI-NEXT: addq $8, %rsp ; X64-NOBMI-NEXT: popq %rbx -; X64-NOBMI-NEXT: popq %rbp +; X64-NOBMI-NEXT: popq %r14 ; X64-NOBMI-NEXT: retq ; ; X64-BMI1-LABEL: bextr64_32_a1_trunc_extrause: ; X64-BMI1: # %bb.0: -; X64-BMI1-NEXT: pushq %rbp +; X64-BMI1-NEXT: pushq %r14 ; X64-BMI1-NEXT: pushq %rbx ; X64-BMI1-NEXT: pushq %rax -; X64-BMI1-NEXT: movl %edx, %ebp +; X64-BMI1-NEXT: movl %edx, %ebx ; X64-BMI1-NEXT: movq %rsi, %rcx -; X64-BMI1-NEXT: movq %rdi, %rbx +; X64-BMI1-NEXT: movq %rdi, %r14 ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-BMI1-NEXT: shrq %cl, %rbx -; X64-BMI1-NEXT: movl %ebx, %edi +; X64-BMI1-NEXT: shrq %cl, %r14 +; X64-BMI1-NEXT: movl %r14d, %edi ; X64-BMI1-NEXT: callq use32@PLT -; X64-BMI1-NEXT: shll $8, %ebp -; X64-BMI1-NEXT: bextrl %ebp, %ebx, %eax +; X64-BMI1-NEXT: shll $8, %ebx +; X64-BMI1-NEXT: bextrl %ebx, %r14d, %eax ; X64-BMI1-NEXT: addq $8, %rsp ; X64-BMI1-NEXT: popq %rbx -; X64-BMI1-NEXT: popq %rbp +; X64-BMI1-NEXT: popq %r14 ; X64-BMI1-NEXT: retq ; ; X64-BMI2-LABEL: bextr64_32_a1_trunc_extrause: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: pushq %rbp +; X64-BMI2-NEXT: pushq %r14 ; X64-BMI2-NEXT: pushq %rbx ; X64-BMI2-NEXT: pushq %rax -; X64-BMI2-NEXT: movl %edx, %ebp -; X64-BMI2-NEXT: shrxq %rsi, %rdi, %rbx -; X64-BMI2-NEXT: movl %ebx, %edi +; X64-BMI2-NEXT: movl %edx, %ebx +; X64-BMI2-NEXT: shrxq %rsi, %rdi, %r14 +; X64-BMI2-NEXT: movl %r14d, %edi ; X64-BMI2-NEXT: callq use32@PLT -; X64-BMI2-NEXT: bzhil %ebp, %ebx, %eax +; X64-BMI2-NEXT: bzhil %ebx, %r14d, %eax ; X64-BMI2-NEXT: addq $8, %rsp ; X64-BMI2-NEXT: popq %rbx -; X64-BMI2-NEXT: popq %rbp +; X64-BMI2-NEXT: popq %r14 ; X64-BMI2-NEXT: retq %shifted = lshr i64 %val, %numskipbits %truncshifted = trunc i64 %shifted to i32 @@ -4782,20 +4782,20 @@ ; X64-NOBMI-NEXT: pushq %rbp ; X64-NOBMI-NEXT: pushq %r14 ; X64-NOBMI-NEXT: pushq %rbx -; X64-NOBMI-NEXT: movl %esi, %r14d +; X64-NOBMI-NEXT: movl %esi, %ebx ; X64-NOBMI-NEXT: movl %edi, %ebp -; X64-NOBMI-NEXT: movl %r14d, %ecx +; X64-NOBMI-NEXT: movl %ebx, %ecx ; X64-NOBMI-NEXT: shrl %cl, %ebp ; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movl $-1, %ebx +; X64-NOBMI-NEXT: movl $-1, %r14d ; X64-NOBMI-NEXT: movl %edx, %ecx -; X64-NOBMI-NEXT: shrl %cl, %ebx -; X64-NOBMI-NEXT: movl %ebx, %edi -; X64-NOBMI-NEXT: callq use32@PLT -; X64-NOBMI-NEXT: andl 
%ebp, %ebx +; X64-NOBMI-NEXT: shrl %cl, %r14d ; X64-NOBMI-NEXT: movl %r14d, %edi ; X64-NOBMI-NEXT: callq use32@PLT -; X64-NOBMI-NEXT: movl %ebx, %eax +; X64-NOBMI-NEXT: andl %ebp, %r14d +; X64-NOBMI-NEXT: movl %ebx, %edi +; X64-NOBMI-NEXT: callq use32@PLT +; X64-NOBMI-NEXT: movl %r14d, %eax ; X64-NOBMI-NEXT: popq %rbx ; X64-NOBMI-NEXT: popq %r14 ; X64-NOBMI-NEXT: popq %rbp @@ -4806,20 +4806,20 @@ ; X64-BMI1-NEXT: pushq %rbp ; X64-BMI1-NEXT: pushq %r14 ; X64-BMI1-NEXT: pushq %rbx -; X64-BMI1-NEXT: movl %esi, %r14d +; X64-BMI1-NEXT: movl %esi, %ebx ; X64-BMI1-NEXT: movl %edi, %ebp -; X64-BMI1-NEXT: movl %r14d, %ecx +; X64-BMI1-NEXT: movl %ebx, %ecx ; X64-BMI1-NEXT: shrl %cl, %ebp ; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movl $-1, %ebx +; X64-BMI1-NEXT: movl $-1, %r14d ; X64-BMI1-NEXT: movl %edx, %ecx -; X64-BMI1-NEXT: shrl %cl, %ebx -; X64-BMI1-NEXT: movl %ebx, %edi -; X64-BMI1-NEXT: callq use32@PLT -; X64-BMI1-NEXT: andl %ebp, %ebx +; X64-BMI1-NEXT: shrl %cl, %r14d ; X64-BMI1-NEXT: movl %r14d, %edi ; X64-BMI1-NEXT: callq use32@PLT -; X64-BMI1-NEXT: movl %ebx, %eax +; X64-BMI1-NEXT: andl %ebp, %r14d +; X64-BMI1-NEXT: movl %ebx, %edi +; X64-BMI1-NEXT: callq use32@PLT +; X64-BMI1-NEXT: movl %r14d, %eax ; X64-BMI1-NEXT: popq %rbx ; X64-BMI1-NEXT: popq %r14 ; X64-BMI1-NEXT: popq %rbp @@ -5000,17 +5000,17 @@ ; X64-NOBMI-NEXT: pushq %rbx ; X64-NOBMI-NEXT: pushq %rax ; X64-NOBMI-NEXT: movq %rsi, %rcx -; X64-NOBMI-NEXT: movq %rdi, %r14 +; X64-NOBMI-NEXT: movq %rdi, %rbx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-NOBMI-NEXT: shrq %cl, %r14 +; X64-NOBMI-NEXT: shrq %cl, %rbx ; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movq $-1, %rbx +; X64-NOBMI-NEXT: movq $-1, %r14 ; X64-NOBMI-NEXT: movl %edx, %ecx -; X64-NOBMI-NEXT: shrq %cl, %rbx -; X64-NOBMI-NEXT: movq %rbx, %rdi +; X64-NOBMI-NEXT: shrq %cl, %r14 +; X64-NOBMI-NEXT: movq %r14, %rdi ; X64-NOBMI-NEXT: callq use64@PLT -; X64-NOBMI-NEXT: andq %r14, %rbx -; X64-NOBMI-NEXT: movq %rbx, %rax +; X64-NOBMI-NEXT: andq %rbx, %r14 +; X64-NOBMI-NEXT: movq %r14, %rax ; X64-NOBMI-NEXT: addq $8, %rsp ; X64-NOBMI-NEXT: popq %rbx ; X64-NOBMI-NEXT: popq %r14 @@ -5022,17 +5022,17 @@ ; X64-BMI1-NEXT: pushq %rbx ; X64-BMI1-NEXT: pushq %rax ; X64-BMI1-NEXT: movq %rsi, %rcx -; X64-BMI1-NEXT: movq %rdi, %r14 +; X64-BMI1-NEXT: movq %rdi, %rbx ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-BMI1-NEXT: shrq %cl, %r14 +; X64-BMI1-NEXT: shrq %cl, %rbx ; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movq $-1, %rbx +; X64-BMI1-NEXT: movq $-1, %r14 ; X64-BMI1-NEXT: movl %edx, %ecx -; X64-BMI1-NEXT: shrq %cl, %rbx -; X64-BMI1-NEXT: movq %rbx, %rdi +; X64-BMI1-NEXT: shrq %cl, %r14 +; X64-BMI1-NEXT: movq %r14, %rdi ; X64-BMI1-NEXT: callq use64@PLT -; X64-BMI1-NEXT: andq %r14, %rbx -; X64-BMI1-NEXT: movq %rbx, %rax +; X64-BMI1-NEXT: andq %rbx, %r14 +; X64-BMI1-NEXT: movq %r14, %rax ; X64-BMI1-NEXT: addq $8, %rsp ; X64-BMI1-NEXT: popq %rbx ; X64-BMI1-NEXT: popq %r14 @@ -5206,17 +5206,17 @@ ; X64-NOBMI-NEXT: pushq %rbx ; X64-NOBMI-NEXT: pushq %rax ; X64-NOBMI-NEXT: movl %esi, %ecx -; X64-NOBMI-NEXT: movq %rdi, %r14 +; X64-NOBMI-NEXT: movq %rdi, %rbx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NOBMI-NEXT: shrq %cl, %r14 +; X64-NOBMI-NEXT: shrq %cl, %rbx ; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movq $-1, %rbx +; X64-NOBMI-NEXT: movq $-1, %r14 ; X64-NOBMI-NEXT: movl %edx, %ecx -; X64-NOBMI-NEXT: shrq %cl, %rbx -; X64-NOBMI-NEXT: movq %rbx, %rdi +; X64-NOBMI-NEXT: shrq %cl, %r14 +; X64-NOBMI-NEXT: movq %r14, %rdi ; X64-NOBMI-NEXT: 
callq use64@PLT -; X64-NOBMI-NEXT: andq %r14, %rbx -; X64-NOBMI-NEXT: movq %rbx, %rax +; X64-NOBMI-NEXT: andq %rbx, %r14 +; X64-NOBMI-NEXT: movq %r14, %rax ; X64-NOBMI-NEXT: addq $8, %rsp ; X64-NOBMI-NEXT: popq %rbx ; X64-NOBMI-NEXT: popq %r14 @@ -5228,17 +5228,17 @@ ; X64-BMI1-NEXT: pushq %rbx ; X64-BMI1-NEXT: pushq %rax ; X64-BMI1-NEXT: movl %esi, %ecx -; X64-BMI1-NEXT: movq %rdi, %r14 +; X64-BMI1-NEXT: movq %rdi, %rbx ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-BMI1-NEXT: shrq %cl, %r14 +; X64-BMI1-NEXT: shrq %cl, %rbx ; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movq $-1, %rbx +; X64-BMI1-NEXT: movq $-1, %r14 ; X64-BMI1-NEXT: movl %edx, %ecx -; X64-BMI1-NEXT: shrq %cl, %rbx -; X64-BMI1-NEXT: movq %rbx, %rdi +; X64-BMI1-NEXT: shrq %cl, %r14 +; X64-BMI1-NEXT: movq %r14, %rdi ; X64-BMI1-NEXT: callq use64@PLT -; X64-BMI1-NEXT: andq %r14, %rbx -; X64-BMI1-NEXT: movq %rbx, %rax +; X64-BMI1-NEXT: andq %rbx, %r14 +; X64-BMI1-NEXT: movq %r14, %rax ; X64-BMI1-NEXT: addq $8, %rsp ; X64-BMI1-NEXT: popq %rbx ; X64-BMI1-NEXT: popq %r14 @@ -5838,17 +5838,17 @@ ; X64-NOBMI-NEXT: pushq %rbx ; X64-NOBMI-NEXT: pushq %rax ; X64-NOBMI-NEXT: movq %rsi, %rcx -; X64-NOBMI-NEXT: movq %rdi, %r14 +; X64-NOBMI-NEXT: movq %rdi, %rbx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-NOBMI-NEXT: shrq %cl, %r14 +; X64-NOBMI-NEXT: shrq %cl, %rbx ; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movq $-1, %rbx +; X64-NOBMI-NEXT: movq $-1, %r14 ; X64-NOBMI-NEXT: movl %edx, %ecx -; X64-NOBMI-NEXT: shrq %cl, %rbx -; X64-NOBMI-NEXT: movq %rbx, %rdi +; X64-NOBMI-NEXT: shrq %cl, %r14 +; X64-NOBMI-NEXT: movq %r14, %rdi ; X64-NOBMI-NEXT: callq use64@PLT -; X64-NOBMI-NEXT: andq %r14, %rbx -; X64-NOBMI-NEXT: movq %rbx, %rax +; X64-NOBMI-NEXT: andq %rbx, %r14 +; X64-NOBMI-NEXT: movq %r14, %rax ; X64-NOBMI-NEXT: addq $8, %rsp ; X64-NOBMI-NEXT: popq %rbx ; X64-NOBMI-NEXT: popq %r14 @@ -5860,17 +5860,17 @@ ; X64-BMI1-NEXT: pushq %rbx ; X64-BMI1-NEXT: pushq %rax ; X64-BMI1-NEXT: movq %rsi, %rcx -; X64-BMI1-NEXT: movq %rdi, %r14 +; X64-BMI1-NEXT: movq %rdi, %rbx ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-BMI1-NEXT: shrq %cl, %r14 +; X64-BMI1-NEXT: shrq %cl, %rbx ; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movq $-1, %rbx +; X64-BMI1-NEXT: movq $-1, %r14 ; X64-BMI1-NEXT: movl %edx, %ecx -; X64-BMI1-NEXT: shrq %cl, %rbx -; X64-BMI1-NEXT: movq %rbx, %rdi +; X64-BMI1-NEXT: shrq %cl, %r14 +; X64-BMI1-NEXT: movq %r14, %rdi ; X64-BMI1-NEXT: callq use64@PLT -; X64-BMI1-NEXT: andq %r14, %rbx -; X64-BMI1-NEXT: movq %rbx, %rax +; X64-BMI1-NEXT: andq %rbx, %r14 +; X64-BMI1-NEXT: movq %r14, %rax ; X64-BMI1-NEXT: addq $8, %rsp ; X64-BMI1-NEXT: popq %rbx ; X64-BMI1-NEXT: popq %r14 @@ -6058,20 +6058,20 @@ ; X64-NOBMI-NEXT: pushq %r15 ; X64-NOBMI-NEXT: pushq %r14 ; X64-NOBMI-NEXT: pushq %rbx -; X64-NOBMI-NEXT: movq %rsi, %r14 -; X64-NOBMI-NEXT: movq %rdi, %r15 -; X64-NOBMI-NEXT: movl %r14d, %ecx -; X64-NOBMI-NEXT: shrq %cl, %r15 +; X64-NOBMI-NEXT: movq %rsi, %rbx +; X64-NOBMI-NEXT: movq %rdi, %r14 +; X64-NOBMI-NEXT: movl %ebx, %ecx +; X64-NOBMI-NEXT: shrq %cl, %r14 ; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movq $-1, %rbx +; X64-NOBMI-NEXT: movq $-1, %r15 ; X64-NOBMI-NEXT: movl %edx, %ecx -; X64-NOBMI-NEXT: shrq %cl, %rbx -; X64-NOBMI-NEXT: movq %rbx, %rdi +; X64-NOBMI-NEXT: shrq %cl, %r15 +; X64-NOBMI-NEXT: movq %r15, %rdi ; X64-NOBMI-NEXT: callq use64@PLT -; X64-NOBMI-NEXT: andq %r15, %rbx -; X64-NOBMI-NEXT: movq %r14, %rdi +; X64-NOBMI-NEXT: andq %r14, %r15 +; X64-NOBMI-NEXT: movq %rbx, %rdi ; 
X64-NOBMI-NEXT: callq use64@PLT -; X64-NOBMI-NEXT: movq %rbx, %rax +; X64-NOBMI-NEXT: movq %r15, %rax ; X64-NOBMI-NEXT: popq %rbx ; X64-NOBMI-NEXT: popq %r14 ; X64-NOBMI-NEXT: popq %r15 @@ -6082,20 +6082,20 @@ ; X64-BMI1-NEXT: pushq %r15 ; X64-BMI1-NEXT: pushq %r14 ; X64-BMI1-NEXT: pushq %rbx -; X64-BMI1-NEXT: movq %rsi, %r14 -; X64-BMI1-NEXT: movq %rdi, %r15 -; X64-BMI1-NEXT: movl %r14d, %ecx -; X64-BMI1-NEXT: shrq %cl, %r15 +; X64-BMI1-NEXT: movq %rsi, %rbx +; X64-BMI1-NEXT: movq %rdi, %r14 +; X64-BMI1-NEXT: movl %ebx, %ecx +; X64-BMI1-NEXT: shrq %cl, %r14 ; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movq $-1, %rbx +; X64-BMI1-NEXT: movq $-1, %r15 ; X64-BMI1-NEXT: movl %edx, %ecx -; X64-BMI1-NEXT: shrq %cl, %rbx -; X64-BMI1-NEXT: movq %rbx, %rdi +; X64-BMI1-NEXT: shrq %cl, %r15 +; X64-BMI1-NEXT: movq %r15, %rdi ; X64-BMI1-NEXT: callq use64@PLT -; X64-BMI1-NEXT: andq %r15, %rbx -; X64-BMI1-NEXT: movq %r14, %rdi +; X64-BMI1-NEXT: andq %r14, %r15 +; X64-BMI1-NEXT: movq %rbx, %rdi ; X64-BMI1-NEXT: callq use64@PLT -; X64-BMI1-NEXT: movq %rbx, %rax +; X64-BMI1-NEXT: movq %r15, %rax ; X64-BMI1-NEXT: popq %rbx ; X64-BMI1-NEXT: popq %r14 ; X64-BMI1-NEXT: popq %r15 diff --git a/llvm/test/CodeGen/X86/flt-rounds.ll b/llvm/test/CodeGen/X86/flt-rounds.ll --- a/llvm/test/CodeGen/X86/flt-rounds.ll +++ b/llvm/test/CodeGen/X86/flt-rounds.ll @@ -116,14 +116,14 @@ ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx ; X64-NEXT: shrl $9, %ecx ; X64-NEXT: andb $6, %cl -; X64-NEXT: movl $45, %r14d +; X64-NEXT: movl $45, %ebx ; X64-NEXT: movl $45, %eax ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %eax ; X64-NEXT: andl $3, %eax -; X64-NEXT: xorl %ebx, %ebx +; X64-NEXT: xorl %r14d, %r14d ; X64-NEXT: cmpl $3, %eax -; X64-NEXT: setne %bl +; X64-NEXT: setne %r14b ; X64-NEXT: xorl %edi, %edi ; X64-NEXT: callq fesetround ; X64-NEXT: fnstcw {{[0-9]+}}(%rsp) @@ -134,9 +134,9 @@ ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %eax ; X64-NEXT: andl $3, %eax -; X64-NEXT: leal 1(%rbx), %ebp +; X64-NEXT: leal 1(%r14), %ebp ; X64-NEXT: cmpl $1, %eax -; X64-NEXT: cmovel %ebx, %ebp +; X64-NEXT: cmovel %r14d, %ebp ; X64-NEXT: movl $3072, %edi # imm = 0xC00 ; X64-NEXT: callq fesetround ; X64-NEXT: fnstcw {{[0-9]+}}(%rsp) @@ -156,10 +156,10 @@ ; X64-NEXT: shrl $9, %ecx ; X64-NEXT: andb $6, %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %r14d -; X64-NEXT: andl $3, %r14d +; X64-NEXT: shrl %cl, %ebx +; X64-NEXT: andl $3, %ebx ; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: cmpl $2, %r14d +; X64-NEXT: cmpl $2, %ebx ; X64-NEXT: setne %cl ; X64-NEXT: negl %ecx ; X64-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/fma-commute-loop.ll b/llvm/test/CodeGen/X86/fma-commute-loop.ll --- a/llvm/test/CodeGen/X86/fma-commute-loop.ll +++ b/llvm/test/CodeGen/X86/fma-commute-loop.ll @@ -12,17 +12,17 @@ ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; CHECK-NEXT: leaq (%rbx,%r14,8), %r14 -; CHECK-NEXT: leaq (%rbx,%r15,8), %r15 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; CHECK-NEXT: leaq (%r12,%r14,8), %r14 +; CHECK-NEXT: leaq (%r12,%r15,8), %r15 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %r12d, %r12d ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), 
%r13 -; CHECK-NEXT: addq %r12, %r13 -; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %r12 +; CHECK-NEXT: addq %rbx, %r13 +; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rbx ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 @@ -33,17 +33,17 @@ ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovupd (%rax,%r11,8), %zmm6 ; CHECK-NEXT: vmovupd (%rax,%r13,8), %zmm7 -; CHECK-NEXT: vmovupd (%rax,%r12,8), %zmm8 -; CHECK-NEXT: vbroadcastsd (%r15,%rbx,8), %zmm9 +; CHECK-NEXT: vmovupd (%rax,%rbx,8), %zmm8 +; CHECK-NEXT: vbroadcastsd (%r15,%r12,8), %zmm9 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm6 * zmm9) + zmm0 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm7 * zmm9) + zmm1 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm8 * zmm9) + zmm2 -; CHECK-NEXT: vbroadcastsd (%r14,%rbx,8), %zmm9 +; CHECK-NEXT: vbroadcastsd (%r14,%r12,8), %zmm9 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm3 = (zmm9 * zmm6) + zmm3 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm4 = (zmm9 * zmm7) + zmm4 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm5 = (zmm8 * zmm9) + zmm5 -; CHECK-NEXT: incq %rbx -; CHECK-NEXT: cmpq %rbx, %r10 +; CHECK-NEXT: incq %r12 +; CHECK-NEXT: cmpq %r12, %r10 ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: ## %bb.2: ## %bb51 ; CHECK-NEXT: vmovapd %zmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/fmaddsub-combine.ll b/llvm/test/CodeGen/X86/fmaddsub-combine.ll --- a/llvm/test/CodeGen/X86/fmaddsub-combine.ll +++ b/llvm/test/CodeGen/X86/fmaddsub-combine.ll @@ -569,14 +569,14 @@ ; NOFMA-NEXT: vaddss %xmm4, %xmm3, %xmm3 ; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm4 ; NOFMA-NEXT: vextractf128 $1, %ymm2, %xmm5 -; NOFMA-NEXT: vaddss %xmm5, %xmm4, %xmm8 +; NOFMA-NEXT: vaddss %xmm5, %xmm4, %xmm6 ; NOFMA-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] -; NOFMA-NEXT: vaddss %xmm6, %xmm7, %xmm9 -; NOFMA-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; NOFMA-NEXT: vmovshdup {{.*#+}} xmm6 = xmm2[1,1,3,3] -; NOFMA-NEXT: vsubss %xmm6, %xmm7, %xmm6 -; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3] +; NOFMA-NEXT: vpermilpd {{.*#+}} xmm8 = xmm5[1,0] +; NOFMA-NEXT: vaddss %xmm7, %xmm8, %xmm7 +; NOFMA-NEXT: vmovshdup {{.*#+}} xmm8 = xmm0[1,1,3,3] +; NOFMA-NEXT: vmovshdup {{.*#+}} xmm9 = xmm2[1,1,3,3] +; NOFMA-NEXT: vsubss %xmm9, %xmm8, %xmm8 +; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[2,3] ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] @@ -585,8 +585,8 @@ ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3] ; NOFMA-NEXT: vsubss %xmm2, %xmm1, %xmm1 -; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[2,3] -; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0],xmm1[3] +; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[2,3] +; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3] ; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[3,3,3,3] ; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2 @@ -694,31 +694,31 @@ ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; NOFMA-NEXT: vaddss %xmm4, %xmm0, %xmm8 +; NOFMA-NEXT: vaddss %xmm4, %xmm0, %xmm2 ; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; NOFMA-NEXT: vpermilpd {{.*#+}} xmm6 = xmm4[1,0] -; NOFMA-NEXT: vaddss %xmm6, %xmm3, %xmm9 +; NOFMA-NEXT: vaddss %xmm6, %xmm3, 
%xmm3 ; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm6 ; NOFMA-NEXT: vextractf128 $1, %ymm4, %xmm7 -; NOFMA-NEXT: vaddss %xmm7, %xmm6, %xmm10 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm6[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm7[1,0] -; NOFMA-NEXT: vaddss %xmm2, %xmm3, %xmm2 -; NOFMA-NEXT: vinsertps {{.*#+}} xmm11 = xmm10[0,1],xmm2[0],xmm10[3] -; NOFMA-NEXT: vaddss %xmm5, %xmm1, %xmm10 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm5[1,0] -; NOFMA-NEXT: vaddss %xmm2, %xmm3, %xmm12 -; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm14 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm13 = xmm14[1,0] -; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm15 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm15[1,0] -; NOFMA-NEXT: vaddss %xmm3, %xmm13, %xmm13 -; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; NOFMA-NEXT: vmovshdup {{.*#+}} xmm2 = xmm4[1,1,3,3] -; NOFMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[2,3] -; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] +; NOFMA-NEXT: vaddss %xmm7, %xmm6, %xmm8 +; NOFMA-NEXT: vpermilpd {{.*#+}} xmm9 = xmm6[1,0] +; NOFMA-NEXT: vpermilpd {{.*#+}} xmm10 = xmm7[1,0] +; NOFMA-NEXT: vaddss %xmm10, %xmm9, %xmm9 +; NOFMA-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1],xmm9[0],xmm8[3] +; NOFMA-NEXT: vaddss %xmm5, %xmm1, %xmm9 +; NOFMA-NEXT: vpermilpd {{.*#+}} xmm10 = xmm1[1,0] +; NOFMA-NEXT: vpermilpd {{.*#+}} xmm11 = xmm5[1,0] +; NOFMA-NEXT: vaddss %xmm11, %xmm10, %xmm10 +; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm11 +; NOFMA-NEXT: vpermilpd {{.*#+}} xmm12 = xmm11[1,0] +; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm13 +; NOFMA-NEXT: vpermilpd {{.*#+}} xmm14 = xmm13[1,0] +; NOFMA-NEXT: vaddss %xmm14, %xmm12, %xmm12 +; NOFMA-NEXT: vmovshdup {{.*#+}} xmm14 = xmm0[1,1,3,3] +; NOFMA-NEXT: vmovshdup {{.*#+}} xmm15 = xmm4[1,1,3,3] +; NOFMA-NEXT: vsubss %xmm15, %xmm14, %xmm14 +; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[2,3] +; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm3, %xmm0, %xmm0 @@ -726,22 +726,22 @@ ; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm6[3,3,3,3] ; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm7[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2 -; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[0] +; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[0] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm5[1,1,3,3] ; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[2,3] -; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm12[0],xmm3[3] +; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[2,3] +; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm10[0],xmm3[3] ; NOFMA-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm5[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm4, %xmm1, %xmm1 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm14[1,1,3,3] -; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm15[1,1,3,3] +; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm11[1,1,3,3] +; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm13[1,1,3,3] ; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm13[0,0] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm14[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm5 = 
xmm15[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm12[0,0] +; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[3,3,3,3] +; NOFMA-NEXT: vpermilps {{.*#+}} xmm5 = xmm13[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] ; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -841,11 +841,11 @@ ; NOFMA-NEXT: vaddsd %xmm4, %xmm0, %xmm2 ; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm3 ; NOFMA-NEXT: vextractf128 $1, %ymm4, %xmm6 -; NOFMA-NEXT: vaddsd %xmm6, %xmm3, %xmm9 +; NOFMA-NEXT: vaddsd %xmm6, %xmm3, %xmm7 ; NOFMA-NEXT: vaddsd %xmm5, %xmm1, %xmm8 ; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm1 ; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm5 -; NOFMA-NEXT: vaddsd %xmm5, %xmm1, %xmm7 +; NOFMA-NEXT: vaddsd %xmm5, %xmm1, %xmm9 ; NOFMA-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; NOFMA-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] ; NOFMA-NEXT: vsubsd %xmm4, %xmm0, %xmm0 @@ -853,11 +853,11 @@ ; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm6[1,0] ; NOFMA-NEXT: vsubsd %xmm3, %xmm2, %xmm2 -; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm9[0],xmm2[0] +; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm7[0],xmm2[0] ; NOFMA-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm5[1,0] ; NOFMA-NEXT: vsubsd %xmm3, %xmm1, %xmm1 -; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm7[0],xmm1[0] +; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm9[0],xmm1[0] ; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NOFMA-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 ; NOFMA-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fmaxnum.ll b/llvm/test/CodeGen/X86/fmaxnum.ll --- a/llvm/test/CodeGen/X86/fmaxnum.ll +++ b/llvm/test/CodeGen/X86/fmaxnum.ll @@ -322,29 +322,29 @@ ; ; SSE4-LABEL: test_intrinsic_fmax_v16f32: ; SSE4: # %bb.0: -; SSE4-NEXT: movaps %xmm3, %xmm8 -; SSE4-NEXT: movaps %xmm2, %xmm9 -; SSE4-NEXT: movaps %xmm1, %xmm2 -; SSE4-NEXT: movaps %xmm4, %xmm10 -; SSE4-NEXT: maxps %xmm0, %xmm10 +; SSE4-NEXT: movaps %xmm3, %xmm11 +; SSE4-NEXT: movaps %xmm2, %xmm10 +; SSE4-NEXT: movaps %xmm1, %xmm9 +; SSE4-NEXT: movaps %xmm4, %xmm8 +; SSE4-NEXT: maxps %xmm0, %xmm8 ; SSE4-NEXT: cmpunordps %xmm0, %xmm0 -; SSE4-NEXT: blendvps %xmm0, %xmm4, %xmm10 +; SSE4-NEXT: blendvps %xmm0, %xmm4, %xmm8 ; SSE4-NEXT: movaps %xmm5, %xmm1 -; SSE4-NEXT: maxps %xmm2, %xmm1 -; SSE4-NEXT: cmpunordps %xmm2, %xmm2 -; SSE4-NEXT: movaps %xmm2, %xmm0 -; SSE4-NEXT: blendvps %xmm0, %xmm5, %xmm1 -; SSE4-NEXT: movaps %xmm6, %xmm2 -; SSE4-NEXT: maxps %xmm9, %xmm2 +; SSE4-NEXT: maxps %xmm9, %xmm1 ; SSE4-NEXT: cmpunordps %xmm9, %xmm9 ; SSE4-NEXT: movaps %xmm9, %xmm0 +; SSE4-NEXT: blendvps %xmm0, %xmm5, %xmm1 +; SSE4-NEXT: movaps %xmm6, %xmm2 +; SSE4-NEXT: maxps %xmm10, %xmm2 +; SSE4-NEXT: cmpunordps %xmm10, %xmm10 +; SSE4-NEXT: movaps %xmm10, %xmm0 ; SSE4-NEXT: blendvps %xmm0, %xmm6, %xmm2 ; SSE4-NEXT: movaps %xmm7, %xmm3 -; SSE4-NEXT: maxps %xmm8, %xmm3 -; SSE4-NEXT: cmpunordps %xmm8, %xmm8 -; SSE4-NEXT: movaps %xmm8, %xmm0 +; SSE4-NEXT: maxps %xmm11, %xmm3 +; SSE4-NEXT: cmpunordps %xmm11, %xmm11 +; SSE4-NEXT: movaps %xmm11, %xmm0 ; SSE4-NEXT: blendvps %xmm0, %xmm7, %xmm3 -; SSE4-NEXT: movaps %xmm10, %xmm0 +; SSE4-NEXT: movaps %xmm8, %xmm0 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_intrinsic_fmax_v16f32: @@ -471,29 +471,29 @@ ; ; SSE4-LABEL: test_intrinsic_fmax_v8f64: ; SSE4: # %bb.0: -; SSE4-NEXT: movapd %xmm3, %xmm8 -; SSE4-NEXT: movapd %xmm2, %xmm9 -; SSE4-NEXT: movapd %xmm1, %xmm2 -; SSE4-NEXT: movapd %xmm4, %xmm10 -; SSE4-NEXT: maxpd %xmm0, %xmm10 +; SSE4-NEXT: 
movapd %xmm3, %xmm11 +; SSE4-NEXT: movapd %xmm2, %xmm10 +; SSE4-NEXT: movapd %xmm1, %xmm9 +; SSE4-NEXT: movapd %xmm4, %xmm8 +; SSE4-NEXT: maxpd %xmm0, %xmm8 ; SSE4-NEXT: cmpunordpd %xmm0, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm8 ; SSE4-NEXT: movapd %xmm5, %xmm1 -; SSE4-NEXT: maxpd %xmm2, %xmm1 -; SSE4-NEXT: cmpunordpd %xmm2, %xmm2 -; SSE4-NEXT: movapd %xmm2, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE4-NEXT: movapd %xmm6, %xmm2 -; SSE4-NEXT: maxpd %xmm9, %xmm2 +; SSE4-NEXT: maxpd %xmm9, %xmm1 ; SSE4-NEXT: cmpunordpd %xmm9, %xmm9 ; SSE4-NEXT: movapd %xmm9, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE4-NEXT: movapd %xmm6, %xmm2 +; SSE4-NEXT: maxpd %xmm10, %xmm2 +; SSE4-NEXT: cmpunordpd %xmm10, %xmm10 +; SSE4-NEXT: movapd %xmm10, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm2 ; SSE4-NEXT: movapd %xmm7, %xmm3 -; SSE4-NEXT: maxpd %xmm8, %xmm3 -; SSE4-NEXT: cmpunordpd %xmm8, %xmm8 -; SSE4-NEXT: movapd %xmm8, %xmm0 +; SSE4-NEXT: maxpd %xmm11, %xmm3 +; SSE4-NEXT: cmpunordpd %xmm11, %xmm11 +; SSE4-NEXT: movapd %xmm11, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm3 -; SSE4-NEXT: movapd %xmm10, %xmm0 +; SSE4-NEXT: movapd %xmm8, %xmm0 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_intrinsic_fmax_v8f64: diff --git a/llvm/test/CodeGen/X86/fminnum.ll b/llvm/test/CodeGen/X86/fminnum.ll --- a/llvm/test/CodeGen/X86/fminnum.ll +++ b/llvm/test/CodeGen/X86/fminnum.ll @@ -322,29 +322,29 @@ ; ; SSE4-LABEL: test_intrinsic_fmin_v16f32: ; SSE4: # %bb.0: -; SSE4-NEXT: movaps %xmm3, %xmm8 -; SSE4-NEXT: movaps %xmm2, %xmm9 -; SSE4-NEXT: movaps %xmm1, %xmm2 -; SSE4-NEXT: movaps %xmm4, %xmm10 -; SSE4-NEXT: minps %xmm0, %xmm10 +; SSE4-NEXT: movaps %xmm3, %xmm11 +; SSE4-NEXT: movaps %xmm2, %xmm10 +; SSE4-NEXT: movaps %xmm1, %xmm9 +; SSE4-NEXT: movaps %xmm4, %xmm8 +; SSE4-NEXT: minps %xmm0, %xmm8 ; SSE4-NEXT: cmpunordps %xmm0, %xmm0 -; SSE4-NEXT: blendvps %xmm0, %xmm4, %xmm10 +; SSE4-NEXT: blendvps %xmm0, %xmm4, %xmm8 ; SSE4-NEXT: movaps %xmm5, %xmm1 -; SSE4-NEXT: minps %xmm2, %xmm1 -; SSE4-NEXT: cmpunordps %xmm2, %xmm2 -; SSE4-NEXT: movaps %xmm2, %xmm0 -; SSE4-NEXT: blendvps %xmm0, %xmm5, %xmm1 -; SSE4-NEXT: movaps %xmm6, %xmm2 -; SSE4-NEXT: minps %xmm9, %xmm2 +; SSE4-NEXT: minps %xmm9, %xmm1 ; SSE4-NEXT: cmpunordps %xmm9, %xmm9 ; SSE4-NEXT: movaps %xmm9, %xmm0 +; SSE4-NEXT: blendvps %xmm0, %xmm5, %xmm1 +; SSE4-NEXT: movaps %xmm6, %xmm2 +; SSE4-NEXT: minps %xmm10, %xmm2 +; SSE4-NEXT: cmpunordps %xmm10, %xmm10 +; SSE4-NEXT: movaps %xmm10, %xmm0 ; SSE4-NEXT: blendvps %xmm0, %xmm6, %xmm2 ; SSE4-NEXT: movaps %xmm7, %xmm3 -; SSE4-NEXT: minps %xmm8, %xmm3 -; SSE4-NEXT: cmpunordps %xmm8, %xmm8 -; SSE4-NEXT: movaps %xmm8, %xmm0 +; SSE4-NEXT: minps %xmm11, %xmm3 +; SSE4-NEXT: cmpunordps %xmm11, %xmm11 +; SSE4-NEXT: movaps %xmm11, %xmm0 ; SSE4-NEXT: blendvps %xmm0, %xmm7, %xmm3 -; SSE4-NEXT: movaps %xmm10, %xmm0 +; SSE4-NEXT: movaps %xmm8, %xmm0 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_intrinsic_fmin_v16f32: @@ -471,29 +471,29 @@ ; ; SSE4-LABEL: test_intrinsic_fmin_v8f64: ; SSE4: # %bb.0: -; SSE4-NEXT: movapd %xmm3, %xmm8 -; SSE4-NEXT: movapd %xmm2, %xmm9 -; SSE4-NEXT: movapd %xmm1, %xmm2 -; SSE4-NEXT: movapd %xmm4, %xmm10 -; SSE4-NEXT: minpd %xmm0, %xmm10 +; SSE4-NEXT: movapd %xmm3, %xmm11 +; SSE4-NEXT: movapd %xmm2, %xmm10 +; SSE4-NEXT: movapd %xmm1, %xmm9 +; SSE4-NEXT: movapd %xmm4, %xmm8 +; SSE4-NEXT: minpd %xmm0, %xmm8 ; SSE4-NEXT: cmpunordpd %xmm0, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm8 ; SSE4-NEXT: movapd 
%xmm5, %xmm1 -; SSE4-NEXT: minpd %xmm2, %xmm1 -; SSE4-NEXT: cmpunordpd %xmm2, %xmm2 -; SSE4-NEXT: movapd %xmm2, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE4-NEXT: movapd %xmm6, %xmm2 -; SSE4-NEXT: minpd %xmm9, %xmm2 +; SSE4-NEXT: minpd %xmm9, %xmm1 ; SSE4-NEXT: cmpunordpd %xmm9, %xmm9 ; SSE4-NEXT: movapd %xmm9, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE4-NEXT: movapd %xmm6, %xmm2 +; SSE4-NEXT: minpd %xmm10, %xmm2 +; SSE4-NEXT: cmpunordpd %xmm10, %xmm10 +; SSE4-NEXT: movapd %xmm10, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm2 ; SSE4-NEXT: movapd %xmm7, %xmm3 -; SSE4-NEXT: minpd %xmm8, %xmm3 -; SSE4-NEXT: cmpunordpd %xmm8, %xmm8 -; SSE4-NEXT: movapd %xmm8, %xmm0 +; SSE4-NEXT: minpd %xmm11, %xmm3 +; SSE4-NEXT: cmpunordpd %xmm11, %xmm11 +; SSE4-NEXT: movapd %xmm11, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm3 -; SSE4-NEXT: movapd %xmm10, %xmm0 +; SSE4-NEXT: movapd %xmm8, %xmm0 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_intrinsic_fmin_v8f64: diff --git a/llvm/test/CodeGen/X86/fp-stack-2results.ll b/llvm/test/CodeGen/X86/fp-stack-2results.ll --- a/llvm/test/CodeGen/X86/fp-stack-2results.ll +++ b/llvm/test/CodeGen/X86/fp-stack-2results.ll @@ -68,11 +68,11 @@ ; x86_64-NEXT: .cfi_def_cfa_offset 32 ; x86_64-NEXT: .cfi_offset %rbx, -24 ; x86_64-NEXT: .cfi_offset %r14, -16 -; x86_64-NEXT: movq %rsi, %r14 -; x86_64-NEXT: movq %rdi, %rbx +; x86_64-NEXT: movq %rsi, %rbx +; x86_64-NEXT: movq %rdi, %r14 ; x86_64-NEXT: callq test@PLT -; x86_64-NEXT: fstpt (%rbx) ; x86_64-NEXT: fstpt (%r14) +; x86_64-NEXT: fstpt (%rbx) ; x86_64-NEXT: addq $8, %rsp ; x86_64-NEXT: .cfi_def_cfa_offset 24 ; x86_64-NEXT: popq %rbx @@ -121,12 +121,12 @@ ; x86_64-NEXT: .cfi_def_cfa_offset 32 ; x86_64-NEXT: .cfi_offset %rbx, -24 ; x86_64-NEXT: .cfi_offset %r14, -16 -; x86_64-NEXT: movq %rsi, %r14 -; x86_64-NEXT: movq %rdi, %rbx +; x86_64-NEXT: movq %rsi, %rbx +; x86_64-NEXT: movq %rdi, %r14 ; x86_64-NEXT: callq test@PLT ; x86_64-NEXT: fxch %st(1) -; x86_64-NEXT: fstpt (%rbx) ; x86_64-NEXT: fstpt (%r14) +; x86_64-NEXT: fstpt (%rbx) ; x86_64-NEXT: addq $8, %rsp ; x86_64-NEXT: .cfi_def_cfa_offset 24 ; x86_64-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -1175,12 +1175,12 @@ ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movq %rsi, %r14 -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movq %rdi, %r14 ; CHECK-NEXT: callq __eqtf2@PLT ; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: cmovneq %r14, %rbx -; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: cmovneq %rbx, %r14 +; CHECK-NEXT: movq %r14, %rax ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 @@ -1221,12 +1221,12 @@ ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movq %rsi, %r14 -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movq %rdi, %r14 ; CHECK-NEXT: callq __eqtf2@PLT ; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: cmovneq %r14, %rbx -; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: cmovneq %rbx, %r14 +; CHECK-NEXT: movq %r14, %rax ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 @@ -1270,8 +1270,8 @@ ; CHECK-NEXT: subq $32, %rsp ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movq %rsi, %r14 -; 
CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movq %rdi, %r14 ; CHECK-NEXT: callq __eqtf2@PLT ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: sete %bpl @@ -1281,8 +1281,8 @@ ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: setne %al ; CHECK-NEXT: orb %bpl, %al -; CHECK-NEXT: cmoveq %r14, %rbx -; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: cmoveq %rbx, %r14 +; CHECK-NEXT: movq %r14, %rax ; CHECK-NEXT: addq $32, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 @@ -1353,8 +1353,8 @@ ; CHECK-NEXT: subq $32, %rsp ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movq %rsi, %r14 -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movq %rdi, %r14 ; CHECK-NEXT: callq __eqtf2@PLT ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: setne %bpl @@ -1364,8 +1364,8 @@ ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: sete %al ; CHECK-NEXT: testb %bpl, %al -; CHECK-NEXT: cmoveq %r14, %rbx -; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: cmoveq %rbx, %r14 +; CHECK-NEXT: movq %r14, %rax ; CHECK-NEXT: addq $32, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 diff --git a/llvm/test/CodeGen/X86/fp128-select.ll b/llvm/test/CodeGen/X86/fp128-select.ll --- a/llvm/test/CodeGen/X86/fp128-select.ll +++ b/llvm/test/CodeGen/X86/fp128-select.ll @@ -92,16 +92,16 @@ ; NOSSE-NEXT: .cfi_offset %r14, -32 ; NOSSE-NEXT: .cfi_offset %r15, -24 ; NOSSE-NEXT: .cfi_offset %rbp, -16 -; NOSSE-NEXT: movq %rcx, %r12 -; NOSSE-NEXT: movq %rdx, %rbx -; NOSSE-NEXT: movq %rsi, %r14 -; NOSSE-NEXT: movq %rdi, %r15 +; NOSSE-NEXT: movq %rcx, %r15 +; NOSSE-NEXT: movq %rdx, %r12 +; NOSSE-NEXT: movq %rsi, %rbx +; NOSSE-NEXT: movq %rdi, %r14 ; NOSSE-NEXT: callq __netf2@PLT ; NOSSE-NEXT: movl %eax, %ebp -; NOSSE-NEXT: movq %r15, %rdi -; NOSSE-NEXT: movq %r14, %rsi -; NOSSE-NEXT: movq %rbx, %rdx -; NOSSE-NEXT: movq %r12, %rcx +; NOSSE-NEXT: movq %r14, %rdi +; NOSSE-NEXT: movq %rbx, %rsi +; NOSSE-NEXT: movq %r12, %rdx +; NOSSE-NEXT: movq %r15, %rcx ; NOSSE-NEXT: callq __eqtf2@PLT ; NOSSE-NEXT: movl %eax, %ecx ; NOSSE-NEXT: xorl %eax, %eax @@ -111,8 +111,8 @@ ; NOSSE-NEXT: testl %ebp, %ebp ; NOSSE-NEXT: je .LBB1_2 ; NOSSE-NEXT: # %bb.1: -; NOSSE-NEXT: movq %r15, %rax -; NOSSE-NEXT: movq %r14, %rdx +; NOSSE-NEXT: movq %r14, %rax +; NOSSE-NEXT: movq %rbx, %rdx ; NOSSE-NEXT: .LBB1_2: # %BB2 ; NOSSE-NEXT: popq %rbx ; NOSSE-NEXT: .cfi_def_cfa_offset 40 diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -161,62 +161,62 @@ ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm4, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm9, %xmm9 -; CHECK-NEXT: pcmpeqd %xmm9, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm3, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm6, %xmm1 +; CHECK-NEXT: pxor %xmm6, %xmm6 +; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 +; CHECK-NEXT: 
movdqa {{.*#+}} xmm7 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm7, %xmm8 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm8 +; CHECK-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; CHECK-NEXT: pand %xmm5, %xmm9 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; CHECK-NEXT: por %xmm9, %xmm1 ; CHECK-NEXT: pand %xmm1, %xmm4 -; CHECK-NEXT: pandn %xmm8, %xmm1 +; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm4, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm4 ; CHECK-NEXT: pxor %xmm0, %xmm4 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm9, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] ; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm4, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm2 -; CHECK-NEXT: pandn %xmm8, %xmm3 -; CHECK-NEXT: por %xmm2, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: movdqa %xmm3, %xmm4 -; CHECK-NEXT: pxor %xmm0, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; CHECK-NEXT: por %xmm4, %xmm5 +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pandn %xmm3, %xmm5 +; CHECK-NEXT: por %xmm2, %xmm5 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; CHECK-NEXT: movdqa %xmm5, %xmm3 +; CHECK-NEXT: pxor %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-NEXT: pcmpeqd %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 +; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm7, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-NEXT: por %xmm2, %xmm4 -; CHECK-NEXT: pand %xmm4, %xmm3 -; CHECK-NEXT: pandn %xmm8, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm7, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] +; CHECK-NEXT: pand %xmm4, %xmm8 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm8, %xmm3 +; CHECK-NEXT: pand %xmm3, %xmm5 +; CHECK-NEXT: pandn %xmm2, %xmm3 +; CHECK-NEXT: por %xmm5, %xmm3 ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 ; CHECK-NEXT: pcmpgtd %xmm7, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; CHECK-NEXT: pand %xmm4, %xmm5 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm0 +; CHECK-NEXT: por %xmm5, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm8, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i64> @@ -328,56 +328,56 @@ ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = 
[2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm4, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm9, %xmm9 -; CHECK-NEXT: pcmpeqd %xmm9, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm3, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm6, %xmm1 +; CHECK-NEXT: pxor %xmm6, %xmm6 +; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 +; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm7, %xmm8 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm8 +; CHECK-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; CHECK-NEXT: pand %xmm5, %xmm9 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; CHECK-NEXT: por %xmm9, %xmm1 ; CHECK-NEXT: pand %xmm1, %xmm4 -; CHECK-NEXT: pandn %xmm8, %xmm1 +; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm4, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm4 ; CHECK-NEXT: pxor %xmm0, %xmm4 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm9, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] ; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm4, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; CHECK-NEXT: por %xmm4, %xmm5 +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pandn %xmm3, %xmm5 +; CHECK-NEXT: por %xmm2, %xmm5 +; CHECK-NEXT: movdqa %xmm5, %xmm2 +; CHECK-NEXT: pxor %xmm0, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 +; CHECK-NEXT: pcmpeqd %xmm0, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-NEXT: pand %xmm3, %xmm2 -; CHECK-NEXT: pandn %xmm8, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 -; CHECK-NEXT: movdqa %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm5, %xmm3 +; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm4 ; CHECK-NEXT: pcmpgtd %xmm0, %xmm4 ; CHECK-NEXT: pcmpeqd %xmm0, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-NEXT: por %xmm2, %xmm4 -; CHECK-NEXT: pand %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm3 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 -; CHECK-NEXT: pcmpeqd %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: pand %xmm3, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i64> @@ -424,10 +424,10 @@ ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: movdqa %xmm3, %xmm2 +; CHECK-NEXT: movdqa %xmm3, 
%xmm8 ; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; CHECK-NEXT: pxor %xmm4, %xmm4 @@ -439,9 +439,9 @@ ; CHECK-NEXT: pand %xmm3, %xmm7 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] ; CHECK-NEXT: por %xmm7, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm2 -; CHECK-NEXT: pandn %xmm8, %xmm1 -; CHECK-NEXT: por %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm8 +; CHECK-NEXT: pandn %xmm2, %xmm1 +; CHECK-NEXT: por %xmm8, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; CHECK-NEXT: movdqa %xmm7, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 @@ -454,9 +454,9 @@ ; CHECK-NEXT: por %xmm3, %xmm4 ; CHECK-NEXT: movdqa %xmm7, %xmm3 ; CHECK-NEXT: pand %xmm4, %xmm3 -; CHECK-NEXT: pandn %xmm8, %xmm4 +; CHECK-NEXT: pandn %xmm2, %xmm4 ; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; CHECK-NEXT: movdqa %xmm4, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] @@ -464,23 +464,23 @@ ; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 ; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] ; CHECK-NEXT: pcmpgtd %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] +; CHECK-NEXT: pand %xmm5, %xmm8 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm2, %xmm3 +; CHECK-NEXT: por %xmm8, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm4 -; CHECK-NEXT: pandn %xmm8, %xmm3 +; CHECK-NEXT: pandn %xmm2, %xmm3 ; CHECK-NEXT: por %xmm4, %xmm3 ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 ; CHECK-NEXT: pcmpgtd %xmm7, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; CHECK-NEXT: pand %xmm4, %xmm5 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm4, %xmm0 +; CHECK-NEXT: por %xmm5, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm8, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: addq $72, %rsp @@ -633,10 +633,10 @@ ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: movdqa %xmm3, %xmm2 +; CHECK-NEXT: movdqa %xmm3, %xmm8 ; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; CHECK-NEXT: pxor %xmm4, %xmm4 @@ -648,9 +648,9 @@ ; CHECK-NEXT: pand %xmm3, %xmm7 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] ; CHECK-NEXT: por %xmm7, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm2 -; CHECK-NEXT: pandn %xmm8, %xmm1 -; CHECK-NEXT: por %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm8 +; CHECK-NEXT: pandn %xmm2, %xmm1 +; CHECK-NEXT: por %xmm8, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; CHECK-NEXT: movdqa %xmm7, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 @@ -663,7 +663,7 @@ ; CHECK-NEXT: por %xmm3, %xmm4 ; CHECK-NEXT: movdqa %xmm7, %xmm3 ; CHECK-NEXT: 
pand %xmm4, %xmm3 -; CHECK-NEXT: pandn %xmm8, %xmm4 +; CHECK-NEXT: pandn %xmm2, %xmm4 ; CHECK-NEXT: por %xmm3, %xmm4 ; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 @@ -1131,8 +1131,8 @@ ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __fixdfti@PLT -; CHECK-NEXT: movq %rax, %r14 -; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: movq %rdx, %r14 ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixdfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx @@ -1142,20 +1142,20 @@ ; CHECK-NEXT: sbbq $0, %rdi ; CHECK-NEXT: cmovgeq %rcx, %rdx ; CHECK-NEXT: cmovgeq %rsi, %rax -; CHECK-NEXT: cmpq %rsi, %r14 -; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: cmpq %rsi, %rbx +; CHECK-NEXT: movq %r14, %rdi ; CHECK-NEXT: sbbq $0, %rdi -; CHECK-NEXT: cmovlq %rbx, %rcx -; CHECK-NEXT: cmovlq %r14, %rsi -; CHECK-NEXT: movabsq $-9223372036854775808, %r8 # imm = 0x8000000000000000 -; CHECK-NEXT: cmpq %rsi, %r8 -; CHECK-NEXT: movq $-1, %rbx -; CHECK-NEXT: movq $-1, %rdi -; CHECK-NEXT: sbbq %rcx, %rdi -; CHECK-NEXT: cmovgeq %r8, %rsi -; CHECK-NEXT: cmpq %rax, %r8 -; CHECK-NEXT: sbbq %rdx, %rbx -; CHECK-NEXT: cmovgeq %r8, %rax +; CHECK-NEXT: cmovlq %r14, %rcx +; CHECK-NEXT: cmovlq %rbx, %rsi +; CHECK-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000 +; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: movq $-1, %r8 +; CHECK-NEXT: movq $-1, %r9 +; CHECK-NEXT: sbbq %rcx, %r9 +; CHECK-NEXT: cmovgeq %rdi, %rsi +; CHECK-NEXT: cmpq %rax, %rdi +; CHECK-NEXT: sbbq %rdx, %r8 +; CHECK-NEXT: cmovgeq %rdi, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movq %rsi, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1287,8 +1287,8 @@ ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __fixsfti@PLT -; CHECK-NEXT: movq %rax, %r14 -; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: movq %rdx, %r14 ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixsfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx @@ -1298,20 +1298,20 @@ ; CHECK-NEXT: sbbq $0, %rdi ; CHECK-NEXT: cmovgeq %rcx, %rdx ; CHECK-NEXT: cmovgeq %rsi, %rax -; CHECK-NEXT: cmpq %rsi, %r14 -; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: cmpq %rsi, %rbx +; CHECK-NEXT: movq %r14, %rdi ; CHECK-NEXT: sbbq $0, %rdi -; CHECK-NEXT: cmovlq %rbx, %rcx -; CHECK-NEXT: cmovlq %r14, %rsi -; CHECK-NEXT: movabsq $-9223372036854775808, %r8 # imm = 0x8000000000000000 -; CHECK-NEXT: cmpq %rsi, %r8 -; CHECK-NEXT: movq $-1, %rbx -; CHECK-NEXT: movq $-1, %rdi -; CHECK-NEXT: sbbq %rcx, %rdi -; CHECK-NEXT: cmovgeq %r8, %rsi -; CHECK-NEXT: cmpq %rax, %r8 -; CHECK-NEXT: sbbq %rdx, %rbx -; CHECK-NEXT: cmovgeq %r8, %rax +; CHECK-NEXT: cmovlq %r14, %rcx +; CHECK-NEXT: cmovlq %rbx, %rsi +; CHECK-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000 +; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: movq $-1, %r8 +; CHECK-NEXT: movq $-1, %r9 +; CHECK-NEXT: sbbq %rcx, %r9 +; CHECK-NEXT: cmovgeq %rdi, %rsi +; CHECK-NEXT: cmpq %rax, %rdi +; CHECK-NEXT: sbbq %rdx, %r8 +; CHECK-NEXT: cmovgeq %rdi, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movq %rsi, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1443,8 +1443,8 @@ ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __fixhfti@PLT -; CHECK-NEXT: movq %rax, %r14 -; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rax, %rbx +; 
CHECK-NEXT: movq %rdx, %r14 ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx @@ -1454,20 +1454,20 @@ ; CHECK-NEXT: sbbq $0, %rdi ; CHECK-NEXT: cmovgeq %rcx, %rdx ; CHECK-NEXT: cmovgeq %rsi, %rax -; CHECK-NEXT: cmpq %rsi, %r14 -; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: cmpq %rsi, %rbx +; CHECK-NEXT: movq %r14, %rdi ; CHECK-NEXT: sbbq $0, %rdi -; CHECK-NEXT: cmovlq %rbx, %rcx -; CHECK-NEXT: cmovlq %r14, %rsi -; CHECK-NEXT: movabsq $-9223372036854775808, %r8 # imm = 0x8000000000000000 -; CHECK-NEXT: cmpq %rsi, %r8 -; CHECK-NEXT: movq $-1, %rbx -; CHECK-NEXT: movq $-1, %rdi -; CHECK-NEXT: sbbq %rcx, %rdi -; CHECK-NEXT: cmovgeq %r8, %rsi -; CHECK-NEXT: cmpq %rax, %r8 -; CHECK-NEXT: sbbq %rdx, %rbx -; CHECK-NEXT: cmovgeq %r8, %rax +; CHECK-NEXT: cmovlq %r14, %rcx +; CHECK-NEXT: cmovlq %rbx, %rsi +; CHECK-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000 +; CHECK-NEXT: cmpq %rsi, %rdi +; CHECK-NEXT: movq $-1, %r8 +; CHECK-NEXT: movq $-1, %r9 +; CHECK-NEXT: sbbq %rcx, %r9 +; CHECK-NEXT: cmovgeq %rdi, %rsi +; CHECK-NEXT: cmpq %rax, %rdi +; CHECK-NEXT: sbbq %rdx, %r8 +; CHECK-NEXT: cmovgeq %rdi, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movq %rsi, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1747,26 +1747,26 @@ ; CHECK-NEXT: movdqa %xmm3, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm8, %xmm8 -; CHECK-NEXT: pcmpeqd %xmm8, %xmm4 +; CHECK-NEXT: pxor %xmm5, %xmm5 +; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] ; CHECK-NEXT: movdqa %xmm6, %xmm7 ; CHECK-NEXT: pcmpgtd %xmm1, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; CHECK-NEXT: pand %xmm4, %xmm8 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: por %xmm8, %xmm1 ; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] ; CHECK-NEXT: pand %xmm1, %xmm3 ; CHECK-NEXT: pandn %xmm4, %xmm1 ; CHECK-NEXT: por %xmm3, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm8, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm7 ; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm3 +; CHECK-NEXT: pand %xmm7, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm5 ; CHECK-NEXT: pand %xmm5, %xmm2 @@ -1911,26 +1911,26 @@ ; CHECK-NEXT: movdqa %xmm3, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm8, %xmm8 -; CHECK-NEXT: pcmpeqd %xmm8, %xmm4 +; CHECK-NEXT: pxor %xmm5, %xmm5 +; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] ; CHECK-NEXT: movdqa %xmm6, %xmm7 ; CHECK-NEXT: pcmpgtd %xmm1, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; CHECK-NEXT: pand %xmm4, %xmm8 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: por %xmm8, %xmm1 ; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] ; CHECK-NEXT: pand %xmm1, %xmm3 ; CHECK-NEXT: pandn %xmm4, %xmm1 ; CHECK-NEXT: por %xmm3, %xmm1 ; CHECK-NEXT: movdqa %xmm2, 
%xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm8, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm7 ; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm3 +; CHECK-NEXT: pand %xmm7, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm5 ; CHECK-NEXT: pand %xmm5, %xmm2 @@ -2709,20 +2709,20 @@ ; CHECK-NEXT: cmoveq %rsi, %rcx ; CHECK-NEXT: cmovsq %r14, %rdi ; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000 -; CHECK-NEXT: movq %rbx, %rsi -; CHECK-NEXT: cmovnsq %rcx, %rsi -; CHECK-NEXT: cmpq %rbx, %rcx -; CHECK-NEXT: cmovbeq %rbx, %rcx +; CHECK-NEXT: movabsq $-9223372036854775808, %rsi # imm = 0x8000000000000000 +; CHECK-NEXT: movq %rsi, %r8 +; CHECK-NEXT: cmovnsq %rcx, %r8 +; CHECK-NEXT: cmpq %rsi, %rcx +; CHECK-NEXT: cmovbeq %rsi, %rcx ; CHECK-NEXT: cmpq $-1, %rdi -; CHECK-NEXT: cmovneq %rsi, %rcx +; CHECK-NEXT: cmovneq %r8, %rcx ; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: movq %rbx, %rsi -; CHECK-NEXT: cmovnsq %rax, %rsi -; CHECK-NEXT: cmpq %rbx, %rax -; CHECK-NEXT: cmovbeq %rbx, %rax +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: cmovnsq %rax, %rdi +; CHECK-NEXT: cmpq %rsi, %rax +; CHECK-NEXT: cmovbeq %rsi, %rax ; CHECK-NEXT: cmpq $-1, %rdx -; CHECK-NEXT: cmovneq %rsi, %rax +; CHECK-NEXT: cmovneq %rdi, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movq %rcx, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -2873,20 +2873,20 @@ ; CHECK-NEXT: cmoveq %rsi, %rcx ; CHECK-NEXT: cmovsq %r14, %rdi ; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000 -; CHECK-NEXT: movq %rbx, %rsi -; CHECK-NEXT: cmovnsq %rcx, %rsi -; CHECK-NEXT: cmpq %rbx, %rcx -; CHECK-NEXT: cmovbeq %rbx, %rcx +; CHECK-NEXT: movabsq $-9223372036854775808, %rsi # imm = 0x8000000000000000 +; CHECK-NEXT: movq %rsi, %r8 +; CHECK-NEXT: cmovnsq %rcx, %r8 +; CHECK-NEXT: cmpq %rsi, %rcx +; CHECK-NEXT: cmovbeq %rsi, %rcx ; CHECK-NEXT: cmpq $-1, %rdi -; CHECK-NEXT: cmovneq %rsi, %rcx +; CHECK-NEXT: cmovneq %r8, %rcx ; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: movq %rbx, %rsi -; CHECK-NEXT: cmovnsq %rax, %rsi -; CHECK-NEXT: cmpq %rbx, %rax -; CHECK-NEXT: cmovbeq %rbx, %rax +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: cmovnsq %rax, %rdi +; CHECK-NEXT: cmpq %rsi, %rax +; CHECK-NEXT: cmovbeq %rsi, %rax ; CHECK-NEXT: cmpq $-1, %rdx -; CHECK-NEXT: cmovneq %rsi, %rax +; CHECK-NEXT: cmovneq %rdi, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movq %rcx, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -3037,20 +3037,20 @@ ; CHECK-NEXT: cmoveq %rsi, %rcx ; CHECK-NEXT: cmovsq %r14, %rdi ; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000 -; CHECK-NEXT: movq %rbx, %rsi -; CHECK-NEXT: cmovnsq %rcx, %rsi -; CHECK-NEXT: cmpq %rbx, %rcx -; CHECK-NEXT: cmovbeq %rbx, %rcx +; CHECK-NEXT: movabsq $-9223372036854775808, %rsi # imm = 0x8000000000000000 +; CHECK-NEXT: movq %rsi, %r8 +; CHECK-NEXT: cmovnsq %rcx, %r8 +; CHECK-NEXT: cmpq %rsi, %rcx +; CHECK-NEXT: cmovbeq %rsi, %rcx ; CHECK-NEXT: cmpq $-1, %rdi -; CHECK-NEXT: cmovneq %rsi, %rcx +; CHECK-NEXT: cmovneq %r8, %rcx ; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: movq %rbx, %rsi -; CHECK-NEXT: cmovnsq %rax, %rsi -; CHECK-NEXT: cmpq %rbx, %rax -; CHECK-NEXT: cmovbeq %rbx, %rax +; 
CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: cmovnsq %rax, %rdi +; CHECK-NEXT: cmpq %rsi, %rax +; CHECK-NEXT: cmovbeq %rsi, %rax ; CHECK-NEXT: cmpq $-1, %rdx -; CHECK-NEXT: cmovneq %rsi, %rax +; CHECK-NEXT: cmovneq %rdi, %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: movq %rcx, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll --- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll @@ -234,44 +234,44 @@ ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movq %rdi, %r12 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: callq __fixsfti@PLT ; CHECK-NEXT: movq %rdx, %r15 -; CHECK-NEXT: xorl %ebp, %ebp +; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %rbp, %rax +; CHECK-NEXT: cmovbq %r14, %rax ; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; CHECK-NEXT: cmovbq %rcx, %r15 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movabsq $9223372036854775807, %r14 # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovaq %r14, %r15 +; CHECK-NEXT: movabsq $9223372036854775807, %rbp # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovaq %rbp, %r15 ; CHECK-NEXT: movq $-1, %rcx ; CHECK-NEXT: cmovaq %rcx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %rbp, %rax +; CHECK-NEXT: cmovpq %r14, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovpq %rbp, %r15 +; CHECK-NEXT: cmovpq %r14, %r15 ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: callq __fixsfti@PLT -; CHECK-NEXT: movq %rax, %r13 -; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rax, %r12 +; CHECK-NEXT: movq %rdx, %r13 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %rbp, %r13 +; CHECK-NEXT: cmovbq %r14, %r12 ; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; CHECK-NEXT: cmovbq %rax, %rbx +; CHECK-NEXT: cmovbq %rax, %r13 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r14, %rbx +; CHECK-NEXT: cmovaq %rbp, %r13 ; CHECK-NEXT: movq $-1, %rax -; CHECK-NEXT: cmovaq %rax, %r13 +; CHECK-NEXT: cmovaq %rax, %r12 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %rbp, %r13 -; CHECK-NEXT: cmovpq %rbp, %rbx +; CHECK-NEXT: cmovpq %r14, %r12 +; CHECK-NEXT: cmovpq %r14, %r13 ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -309,16 +309,16 @@ ; CHECK-NEXT: cmovpq %rsi, %rax ; CHECK-NEXT: movl $0, %ecx ; CHECK-NEXT: cmovpq %rcx, %rdx -; CHECK-NEXT: movq %rdx, 8(%r12) -; CHECK-NEXT: movq %rax, (%r12) -; CHECK-NEXT: movq %r14, 56(%r12) -; CHECK-NEXT: movq %rbp, 48(%r12) -; CHECK-NEXT: movq %rbx, 40(%r12) -; CHECK-NEXT: movq %r13, 32(%r12) -; CHECK-NEXT: movq %r15, 24(%r12) +; CHECK-NEXT: movq %rdx, 8(%rbx) +; CHECK-NEXT: movq %rax, (%rbx) +; CHECK-NEXT: movq %r14, 
56(%rbx) +; CHECK-NEXT: movq %rbp, 48(%rbx) +; CHECK-NEXT: movq %r13, 40(%rbx) +; CHECK-NEXT: movq %r12, 32(%rbx) +; CHECK-NEXT: movq %r15, 24(%rbx) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: movq %rax, 16(%r12) -; CHECK-NEXT: movq %r12, %rax +; CHECK-NEXT: movq %rax, 16(%rbx) +; CHECK-NEXT: movq %rbx, %rax ; CHECK-NEXT: addq $56, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 @@ -477,26 +477,26 @@ ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: callq __fixdfti@PLT -; CHECK-NEXT: movq %rax, %r15 -; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rax, %r14 +; CHECK-NEXT: movq %rdx, %r15 ; CHECK-NEXT: xorl %r12d, %r12d ; CHECK-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r12, %r15 +; CHECK-NEXT: cmovbq %r12, %r14 ; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; CHECK-NEXT: cmovbq %rax, %rbx +; CHECK-NEXT: cmovbq %rax, %r15 ; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movabsq $9223372036854775807, %rbp # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovaq %rbp, %rbx +; CHECK-NEXT: cmovaq %rbp, %r15 ; CHECK-NEXT: movq $-1, %r13 -; CHECK-NEXT: cmovaq %r13, %r15 +; CHECK-NEXT: cmovaq %r13, %r14 ; CHECK-NEXT: ucomisd %xmm0, %xmm0 +; CHECK-NEXT: cmovpq %r12, %r14 ; CHECK-NEXT: cmovpq %r12, %r15 -; CHECK-NEXT: cmovpq %r12, %rbx ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixdfti@PLT ; CHECK-NEXT: movapd (%rsp), %xmm0 # 16-byte Reload @@ -510,11 +510,11 @@ ; CHECK-NEXT: ucomisd %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: cmovpq %r12, %rdx -; CHECK-NEXT: movq %rdx, 8(%r14) -; CHECK-NEXT: movq %rax, (%r14) -; CHECK-NEXT: movq %rbx, 24(%r14) -; CHECK-NEXT: movq %r15, 16(%r14) -; CHECK-NEXT: movq %r14, %rax +; CHECK-NEXT: movq %rdx, 8(%rbx) +; CHECK-NEXT: movq %rax, (%rbx) +; CHECK-NEXT: movq %r15, 24(%rbx) +; CHECK-NEXT: movq %r14, 16(%rbx) +; CHECK-NEXT: movq %rbx, %rax ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 @@ -675,103 +675,103 @@ ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %ebp +; CHECK-NEXT: cvttss2si %xmm0, %r12d ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $128, %r14d -; CHECK-NEXT: cmovbl %r14d, %ebp +; CHECK-NEXT: movl $128, %ebx +; CHECK-NEXT: cmovbl %ebx, %r12d ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $127, %r12d -; CHECK-NEXT: cmoval %r12d, %ebp -; CHECK-NEXT: xorl %r15d, %r15d +; CHECK-NEXT: movl $127, %ebp +; CHECK-NEXT: cmoval %ebp, %r12d +; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r15d, %ebp -; CHECK-NEXT: shll $8, %ebp +; CHECK-NEXT: cmovpl %r14d, %r12d +; CHECK-NEXT: shll $8, %r12d ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 
-; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r15d, %eax -; CHECK-NEXT: movzbl %al, %ebx -; CHECK-NEXT: orl %ebp, %ebx +; CHECK-NEXT: cmovpl %r14d, %eax +; CHECK-NEXT: movzbl %al, %r15d +; CHECK-NEXT: orl %r12d, %r15d ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r15d, %eax -; CHECK-NEXT: movzbl %al, %ebp +; CHECK-NEXT: cmovpl %r14d, %eax +; CHECK-NEXT: movzbl %al, %r12d ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r15d, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: shll $8, %eax -; CHECK-NEXT: orl %ebp, %eax +; CHECK-NEXT: orl %r12d, %eax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: pinsrw $1, %ebx, %xmm0 +; CHECK-NEXT: pinsrw $1, %r15d, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %ebx +; CHECK-NEXT: cvttss2si %xmm0, %r15d ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %ebx +; CHECK-NEXT: cmovbl %ebx, %r15d ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %ebx +; CHECK-NEXT: cmoval %ebp, %r15d ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r15d, %ebx -; CHECK-NEXT: shll $8, %ebx +; CHECK-NEXT: cmovpl %r14d, %r15d +; CHECK-NEXT: shll $8, %r15d ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r15d, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: orl %ebx, %eax +; CHECK-NEXT: orl %r15d, %eax ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: pinsrw $2, %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %ebx +; CHECK-NEXT: cvttss2si %xmm0, %r15d ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %ebx +; CHECK-NEXT: cmovbl %ebx, %r15d ; CHECK-NEXT: ucomiss 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %ebx +; CHECK-NEXT: cmoval %ebp, %r15d ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r15d, %ebx -; CHECK-NEXT: shll $8, %ebx +; CHECK-NEXT: cmovpl %r14d, %r15d +; CHECK-NEXT: shll $8, %r15d ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r12d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r15d, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: orl %ebx, %eax +; CHECK-NEXT: orl %r15d, %eax ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: pinsrw $3, %eax, %xmm0 ; CHECK-NEXT: addq $32, %rsp @@ -797,14 +797,14 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $32768, %r14d # imm = 0x8000 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: movl $32768, %ebx # imm = 0x8000 +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movl $32767, %ebp # imm = 0x7FFF ; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -812,11 +812,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] @@ -826,11 +826,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -838,11 +838,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] @@ -854,11 +854,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -866,11 +866,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] @@ -879,11 +879,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload @@ -891,11 +891,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -924,14 +924,14 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $-2147483648, %r14d # imm = 0x80000000 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: movl $-2147483648, %ebx # imm = 0x80000000 +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF ; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -939,11 +939,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; 
CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] @@ -952,11 +952,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload @@ -964,11 +964,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -980,11 +980,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -992,11 +992,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] @@ -1006,11 +1006,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl %r14d, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -1018,11 +1018,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ebx, %eax +; CHECK-NEXT: cmovpl 
%r14d, %eax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] @@ -1049,11 +1049,11 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movabsq $-9223372036854775808, %r14 # imm = 0x8000000000000000 -; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000 +; CHECK-NEXT: cmovbq %rbx, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movabsq $9223372036854775807, %rbx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovaq %rbx, %rax +; CHECK-NEXT: movabsq $9223372036854775807, %r14 # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovaq %r14, %rax ; CHECK-NEXT: xorl %r15d, %r15d ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r15, %rax @@ -1064,9 +1064,9 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: cmovbq %rbx, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rax +; CHECK-NEXT: cmovaq %r14, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 @@ -1078,9 +1078,9 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: cmovbq %rbx, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rax +; CHECK-NEXT: cmovaq %r14, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 @@ -1090,9 +1090,9 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: cmovbq %rbx, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rax +; CHECK-NEXT: cmovaq %r14, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 @@ -1104,9 +1104,9 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: cmovbq %rbx, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rax +; CHECK-NEXT: cmovaq %r14, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 @@ -1116,9 +1116,9 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: cmovbq %rbx, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rax +; CHECK-NEXT: cmovaq %r14, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 @@ -1130,9 +1130,9 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: cmovbq %rbx, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rax +; CHECK-NEXT: cmovaq %r14, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; 
CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm0 @@ -1142,9 +1142,9 @@ ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: cmovbq %rbx, %rax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rax +; CHECK-NEXT: cmovaq %r14, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r15, %rax ; CHECK-NEXT: movq %rax, %xmm3 @@ -1189,10 +1189,10 @@ ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF ; CHECK-NEXT: cmovaq %rcx, %rdx -; CHECK-NEXT: movq %rcx, %rbp +; CHECK-NEXT: movq %rcx, %r15 ; CHECK-NEXT: movq $-1, %rcx ; CHECK-NEXT: cmovaq %rcx, %rax -; CHECK-NEXT: movq $-1, %r15 +; CHECK-NEXT: movq $-1, %r13 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1209,9 +1209,9 @@ ; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbp, %rdx -; CHECK-NEXT: cmovaq %r15, %rax -; CHECK-NEXT: movq $-1, %r15 +; CHECK-NEXT: cmovaq %r15, %rdx +; CHECK-NEXT: cmovaq %r13, %rax +; CHECK-NEXT: movq $-1, %r13 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1228,9 +1228,9 @@ ; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbp, %rdx -; CHECK-NEXT: cmovaq %r15, %rax -; CHECK-NEXT: movq $-1, %r15 +; CHECK-NEXT: cmovaq %r15, %rdx +; CHECK-NEXT: cmovaq %r13, %rax +; CHECK-NEXT: movq $-1, %r13 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1247,10 +1247,8 @@ ; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: cmovbq %r14, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbp, %rdx -; CHECK-NEXT: movq %rbp, %r13 -; CHECK-NEXT: cmovaq %r15, %rax -; CHECK-NEXT: movq $-1, %r15 +; CHECK-NEXT: cmovaq %r15, %rdx +; CHECK-NEXT: cmovaq %r13, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1267,9 +1265,11 @@ ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: cmovbq %r14, %rbp +; CHECK-NEXT: movq %r14, %r13 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r13, %rbp -; CHECK-NEXT: cmovaq %r15, %rax +; CHECK-NEXT: cmovaq %r15, %rbp +; CHECK-NEXT: movq $-1, %rcx +; CHECK-NEXT: cmovaq %rcx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1285,10 +1285,10 @@ ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %r14 -; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; CHECK-NEXT: cmovbq %rax, %r15 +; CHECK-NEXT: cmovbq %r13, %r15 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %r13, %r15 +; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovaq %rax, %r15 ; CHECK-NEXT: movq $-1, %rax ; CHECK-NEXT: cmovaq %rax, %r14 ; CHECK-NEXT: ucomiss 
%xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll --- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll @@ -485,22 +485,22 @@ ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $32, %rsp ; CHECK-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: callq __fixunsdfti@PLT -; CHECK-NEXT: movq %rax, %r15 -; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rax, %r14 +; CHECK-NEXT: movq %rdx, %r15 ; CHECK-NEXT: xorl %r12d, %r12d ; CHECK-NEXT: xorpd %xmm0, %xmm0 ; CHECK-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: ucomisd %xmm0, %xmm1 -; CHECK-NEXT: cmovbq %r12, %rbx ; CHECK-NEXT: cmovbq %r12, %r15 +; CHECK-NEXT: cmovbq %r12, %r14 ; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: movq $-1, %r13 +; CHECK-NEXT: cmovaq %r13, %r14 ; CHECK-NEXT: cmovaq %r13, %r15 -; CHECK-NEXT: cmovaq %r13, %rbx ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixunsdfti@PLT ; CHECK-NEXT: movapd (%rsp), %xmm0 # 16-byte Reload @@ -510,11 +510,11 @@ ; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r13, %rax ; CHECK-NEXT: cmovaq %r13, %rdx -; CHECK-NEXT: movq %rdx, 8(%r14) -; CHECK-NEXT: movq %rax, (%r14) -; CHECK-NEXT: movq %rbx, 24(%r14) -; CHECK-NEXT: movq %r15, 16(%r14) -; CHECK-NEXT: movq %r14, %rax +; CHECK-NEXT: movq %rdx, 8(%rbx) +; CHECK-NEXT: movq %rax, (%rbx) +; CHECK-NEXT: movq %r15, 24(%rbx) +; CHECK-NEXT: movq %r14, 16(%rbx) +; CHECK-NEXT: movq %rbx, %rax ; CHECK-NEXT: addq $32, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 @@ -657,87 +657,87 @@ ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %ebp -; CHECK-NEXT: xorl %r14d, %r14d +; CHECK-NEXT: cvttss2si %xmm0, %r15d +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: cmovbl %r14d, %ebp +; CHECK-NEXT: cmovbl %ebx, %r15d ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $255, %r15d -; CHECK-NEXT: cmoval %r15d, %ebp -; CHECK-NEXT: shll $8, %ebp +; CHECK-NEXT: movl $255, %ebp +; CHECK-NEXT: cmoval %ebp, %r15d +; CHECK-NEXT: shll $8, %r15d ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %eax -; CHECK-NEXT: movzbl %al, %ebx -; CHECK-NEXT: orl %ebp, %ebx +; CHECK-NEXT: cmoval %ebp, %eax +; CHECK-NEXT: movzbl %al, %r14d +; CHECK-NEXT: orl %r15d, %r14d ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %eax -; CHECK-NEXT: movzbl %al, %ebp +; CHECK-NEXT: cmoval %ebp, %eax +; CHECK-NEXT: movzbl %al, %r15d ; CHECK-NEXT: movdqa (%rsp), 
%xmm0 # 16-byte Reload ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: shll $8, %eax -; CHECK-NEXT: orl %ebp, %eax +; CHECK-NEXT: orl %r15d, %eax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: pinsrw $1, %ebx, %xmm0 +; CHECK-NEXT: pinsrw $1, %r14d, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %ebx +; CHECK-NEXT: cvttss2si %xmm0, %r14d ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %ebx +; CHECK-NEXT: cmovbl %ebx, %r14d ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %ebx -; CHECK-NEXT: shll $8, %ebx +; CHECK-NEXT: cmoval %ebp, %r14d +; CHECK-NEXT: shll $8, %r14d ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: orl %ebx, %eax +; CHECK-NEXT: orl %r14d, %eax ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: pinsrw $2, %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %ebx +; CHECK-NEXT: cvttss2si %xmm0, %r14d ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %ebx +; CHECK-NEXT: cmovbl %ebx, %r14d ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %ebx -; CHECK-NEXT: shll $8, %ebx +; CHECK-NEXT: cmoval %ebp, %r14d +; CHECK-NEXT: shll $8, %r14d ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %r14d, %eax +; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %r15d, %eax +; CHECK-NEXT: cmoval %ebp, %eax ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: orl %ebx, %eax +; CHECK-NEXT: orl %r14d, %eax ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: pinsrw $3, %eax, %xmm0 ; CHECK-NEXT: addq $40, %rsp @@ -983,13 +983,13 @@ ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: xorl %r14d, %r14d +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0 -; CHECK-NEXT: movq $-1, %rbx -; CHECK-NEXT: cmovaq %rbx, %rdx +; CHECK-NEXT: movq $-1, %r14 +; CHECK-NEXT: cmovaq %r14, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload @@ -1004,9 +1004,9 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rdx +; CHECK-NEXT: cmovaq %r14, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -1023,9 +1023,9 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rdx +; CHECK-NEXT: cmovaq %r14, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -1040,9 +1040,9 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rdx +; CHECK-NEXT: cmovaq %r14, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] @@ -1059,9 +1059,9 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rdx +; CHECK-NEXT: cmovaq %r14, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -1076,9 +1076,9 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rdx +; CHECK-NEXT: cmovaq %r14, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] @@ -1095,9 +1095,9 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rdx +; CHECK-NEXT: cmovaq %r14, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -1112,9 +1112,9 @@ ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovaq %rbx, %rdx +; CHECK-NEXT: cmovaq %r14, %rdx ; CHECK-NEXT: movq %rdx, 
%xmm3 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; CHECK-NEXT: # xmm3 = xmm3[0],mem[0] diff --git a/llvm/test/CodeGen/X86/gather-addresses.ll b/llvm/test/CodeGen/X86/gather-addresses.ll --- a/llvm/test/CodeGen/X86/gather-addresses.ll +++ b/llvm/test/CodeGen/X86/gather-addresses.ll @@ -53,17 +53,17 @@ ; WIN-SSE2: # %bb.0: ; WIN-SSE2-NEXT: movdqa (%rdx), %xmm0 ; WIN-SSE2-NEXT: pand (%r8), %xmm0 -; WIN-SSE2-NEXT: movd %xmm0, %r8d +; WIN-SSE2-NEXT: movd %xmm0, %eax ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; WIN-SSE2-NEXT: movd %xmm1, %r9d +; WIN-SSE2-NEXT: movd %xmm1, %edx ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; WIN-SSE2-NEXT: movd %xmm1, %r10d +; WIN-SSE2-NEXT: movd %xmm1, %r8d ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; WIN-SSE2-NEXT: movd %xmm0, %edx -; WIN-SSE2-NEXT: movslq %r8d, %rax -; WIN-SSE2-NEXT: movslq %r9d, %r8 -; WIN-SSE2-NEXT: movslq %r10d, %r9 +; WIN-SSE2-NEXT: movd %xmm0, %r9d +; WIN-SSE2-NEXT: cltq ; WIN-SSE2-NEXT: movslq %edx, %rdx +; WIN-SSE2-NEXT: movslq %r8d, %r8 +; WIN-SSE2-NEXT: movslq %r9d, %r9 ; WIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; WIN-SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; WIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero @@ -185,18 +185,18 @@ ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; WIN-SSE2-NEXT: movd %xmm1, %ecx ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; WIN-SSE2-NEXT: movd %xmm1, %r8d +; WIN-SSE2-NEXT: movd %xmm1, %edx ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; WIN-SSE2-NEXT: movd %xmm0, %edx +; WIN-SSE2-NEXT: movd %xmm0, %r8d ; WIN-SSE2-NEXT: andl %r9d, %eax ; WIN-SSE2-NEXT: andl %r9d, %ecx -; WIN-SSE2-NEXT: andl %r9d, %r8d ; WIN-SSE2-NEXT: andl %r9d, %edx +; WIN-SSE2-NEXT: andl %r9d, %r8d ; WIN-SSE2-NEXT: movq %rax, %xmm0 ; WIN-SSE2-NEXT: movq %rcx, %xmm1 ; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; WIN-SSE2-NEXT: movq %rdx, %xmm2 -; WIN-SSE2-NEXT: movq %r8, %xmm1 +; WIN-SSE2-NEXT: movq %r8, %xmm2 +; WIN-SSE2-NEXT: movq %rdx, %xmm1 ; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; WIN-SSE2-NEXT: retq ; @@ -206,17 +206,17 @@ ; WIN-SSE4-NEXT: pand (%r8), %xmm0 ; WIN-SSE4-NEXT: movd %xmm0, %eax ; WIN-SSE4-NEXT: pextrd $1, %xmm0, %ecx -; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d -; WIN-SSE4-NEXT: pextrd $3, %xmm0, %edx +; WIN-SSE4-NEXT: pextrd $2, %xmm0, %edx +; WIN-SSE4-NEXT: pextrd $3, %xmm0, %r8d ; WIN-SSE4-NEXT: andl %r9d, %eax ; WIN-SSE4-NEXT: andl %r9d, %ecx -; WIN-SSE4-NEXT: andl %r9d, %r8d ; WIN-SSE4-NEXT: andl %r9d, %edx +; WIN-SSE4-NEXT: andl %r9d, %r8d ; WIN-SSE4-NEXT: movq %rcx, %xmm1 ; WIN-SSE4-NEXT: movq %rax, %xmm0 ; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; WIN-SSE4-NEXT: movq %rdx, %xmm2 -; WIN-SSE4-NEXT: movq %r8, %xmm1 +; WIN-SSE4-NEXT: movq %r8, %xmm2 +; WIN-SSE4-NEXT: movq %rdx, %xmm1 ; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; WIN-SSE4-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/h-registers-1.ll b/llvm/test/CodeGen/X86/h-registers-1.ll --- a/llvm/test/CodeGen/X86/h-registers-1.ll +++ b/llvm/test/CodeGen/X86/h-registers-1.ll @@ -19,21 +19,20 @@ ; CHECK-NEXT: movq %rsi, %rax ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movzbl %bh, %esi -; CHECK-NEXT: movzbl %ah, %eax -; CHECK-NEXT: movq %rax, %r10 +; CHECK-NEXT: movzbl %ah, %edi ; CHECK-NEXT: movzbl %dh, %edx ; CHECK-NEXT: movzbl %ch, %ebp ; CHECK-NEXT: movq %r8, %rax ; CHECK-NEXT: movzbl %ah, %ecx ; CHECK-NEXT: movq %r9, %rax -; CHECK-NEXT: movzbl %ah, %edi +; CHECK-NEXT: movzbl 
%ah, %ebx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-NEXT: addq %r10, %rsi +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: addq %rdi, %rsi ; CHECK-NEXT: addq %rbp, %rdx ; CHECK-NEXT: addq %rsi, %rdx -; CHECK-NEXT: addq %rdi, %rcx -; CHECK-NEXT: addq %rbx, %rax +; CHECK-NEXT: addq %rbx, %rcx +; CHECK-NEXT: addq %r8, %rax ; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: popq %rbx @@ -53,21 +52,20 @@ ; GNUX32-NEXT: movq %rsi, %rax ; GNUX32-NEXT: movq %rdi, %rbx ; GNUX32-NEXT: movzbl %bh, %esi -; GNUX32-NEXT: movzbl %ah, %eax -; GNUX32-NEXT: movq %rax, %r10 +; GNUX32-NEXT: movzbl %ah, %edi ; GNUX32-NEXT: movzbl %dh, %edx ; GNUX32-NEXT: movzbl %ch, %ebp ; GNUX32-NEXT: movq %r8, %rax ; GNUX32-NEXT: movzbl %ah, %ecx ; GNUX32-NEXT: movq %r9, %rax -; GNUX32-NEXT: movzbl %ah, %edi +; GNUX32-NEXT: movzbl %ah, %ebx ; GNUX32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; GNUX32-NEXT: movzbl {{[0-9]+}}(%esp), %ebx -; GNUX32-NEXT: addq %r10, %rsi +; GNUX32-NEXT: movzbl {{[0-9]+}}(%esp), %r8d +; GNUX32-NEXT: addq %rdi, %rsi ; GNUX32-NEXT: addq %rbp, %rdx ; GNUX32-NEXT: addq %rsi, %rdx -; GNUX32-NEXT: addq %rdi, %rcx -; GNUX32-NEXT: addq %rbx, %rax +; GNUX32-NEXT: addq %rbx, %rcx +; GNUX32-NEXT: addq %r8, %rax ; GNUX32-NEXT: addq %rcx, %rax ; GNUX32-NEXT: addq %rdx, %rax ; GNUX32-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -519,13 +519,13 @@ ; SSE3: # %bb.0: ; SSE3-NEXT: movd %xmm0, %ecx ; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE3-NEXT: movd %xmm4, %r8d -; SSE3-NEXT: addl %ecx, %r8d +; SSE3-NEXT: movd %xmm4, %eax +; SSE3-NEXT: addl %ecx, %eax ; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE3-NEXT: movd %xmm4, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE3-NEXT: movd %xmm0, %r9d -; SSE3-NEXT: addl %edx, %r9d +; SSE3-NEXT: movd %xmm0, %ecx +; SSE3-NEXT: addl %edx, %ecx ; SSE3-NEXT: movd %xmm1, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE3-NEXT: movd %xmm0, %esi @@ -535,36 +535,36 @@ ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; SSE3-NEXT: movd %xmm0, %edi ; SSE3-NEXT: addl %edx, %edi -; SSE3-NEXT: movd %xmm2, %eax +; SSE3-NEXT: movd %xmm2, %r8d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE3-NEXT: movd %xmm0, %r10d -; SSE3-NEXT: addl %eax, %r10d +; SSE3-NEXT: movd %xmm0, %edx +; SSE3-NEXT: addl %r8d, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: movd %xmm0, %r8d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] -; SSE3-NEXT: movd %xmm0, %ecx -; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movd %xmm3, %eax +; SSE3-NEXT: movd %xmm0, %r9d +; SSE3-NEXT: addl %r8d, %r9d +; SSE3-NEXT: movd %xmm3, %r8d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE3-NEXT: movd %xmm0, %edx -; SSE3-NEXT: addl %eax, %edx +; SSE3-NEXT: movd %xmm0, %r10d +; SSE3-NEXT: addl %r8d, %r10d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE3-NEXT: movd %xmm0, %r11d +; SSE3-NEXT: movd %xmm0, %r8d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] -; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: addl %r11d, %eax +; SSE3-NEXT: movd %xmm0, %r11d +; SSE3-NEXT: addl %r8d, %r11d ; SSE3-NEXT: movd %edi, %xmm0 ; SSE3-NEXT: movd %esi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: movd %r9d, %xmm2 -; SSE3-NEXT: movd %r8d, %xmm0 +; SSE3-NEXT: movd %ecx, %xmm2 
+; SSE3-NEXT: movd %eax, %xmm0 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE3-NEXT: movd %eax, %xmm1 -; SSE3-NEXT: movd %edx, %xmm2 +; SSE3-NEXT: movd %r11d, %xmm1 +; SSE3-NEXT: movd %r10d, %xmm2 ; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE3-NEXT: movd %ecx, %xmm3 -; SSE3-NEXT: movd %r10d, %xmm1 +; SSE3-NEXT: movd %r9d, %xmm3 +; SSE3-NEXT: movd %edx, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE3-NEXT: retq @@ -634,88 +634,88 @@ ; SSE3-NEXT: pushq %r13 ; SSE3-NEXT: pushq %r12 ; SSE3-NEXT: pushq %rbx -; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: pextrw $1, %xmm0, %ecx -; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE3-NEXT: pextrw $2, %xmm0, %eax -; SSE3-NEXT: pextrw $3, %xmm0, %ecx -; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE3-NEXT: pextrw $5, %xmm0, %r11d -; SSE3-NEXT: addl %eax, %r11d -; SSE3-NEXT: pextrw $6, %xmm0, %eax -; SSE3-NEXT: pextrw $7, %xmm0, %r15d -; SSE3-NEXT: addl %eax, %r15d -; SSE3-NEXT: movd %xmm1, %eax -; SSE3-NEXT: pextrw $1, %xmm1, %r13d -; SSE3-NEXT: addl %eax, %r13d -; SSE3-NEXT: pextrw $2, %xmm1, %eax +; SSE3-NEXT: movd %xmm0, %ecx +; SSE3-NEXT: pextrw $1, %xmm0, %eax +; SSE3-NEXT: addl %ecx, %eax +; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE3-NEXT: pextrw $2, %xmm0, %edx +; SSE3-NEXT: pextrw $3, %xmm0, %eax +; SSE3-NEXT: addl %edx, %eax +; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE3-NEXT: pextrw $4, %xmm0, %edx +; SSE3-NEXT: pextrw $5, %xmm0, %esi +; SSE3-NEXT: addl %edx, %esi +; SSE3-NEXT: pextrw $6, %xmm0, %edx +; SSE3-NEXT: pextrw $7, %xmm0, %r8d +; SSE3-NEXT: addl %edx, %r8d +; SSE3-NEXT: movd %xmm1, %edx +; SSE3-NEXT: pextrw $1, %xmm1, %r10d +; SSE3-NEXT: addl %edx, %r10d +; SSE3-NEXT: pextrw $2, %xmm1, %edx ; SSE3-NEXT: pextrw $3, %xmm1, %ebx -; SSE3-NEXT: addl %eax, %ebx -; SSE3-NEXT: pextrw $4, %xmm1, %eax -; SSE3-NEXT: pextrw $5, %xmm1, %r8d -; SSE3-NEXT: addl %eax, %r8d -; SSE3-NEXT: pextrw $6, %xmm1, %eax -; SSE3-NEXT: pextrw $7, %xmm1, %esi -; SSE3-NEXT: addl %eax, %esi -; SSE3-NEXT: movd %xmm2, %eax -; SSE3-NEXT: pextrw $1, %xmm2, %r10d -; SSE3-NEXT: addl %eax, %r10d -; SSE3-NEXT: pextrw $2, %xmm2, %eax -; SSE3-NEXT: pextrw $3, %xmm2, %r14d -; SSE3-NEXT: addl %eax, %r14d -; SSE3-NEXT: pextrw $4, %xmm2, %eax -; SSE3-NEXT: pextrw $5, %xmm2, %r12d -; SSE3-NEXT: addl %eax, %r12d -; SSE3-NEXT: pextrw $6, %xmm2, %eax -; SSE3-NEXT: pextrw $7, %xmm2, %r9d -; SSE3-NEXT: addl %eax, %r9d -; SSE3-NEXT: movd %xmm3, %eax +; SSE3-NEXT: addl %edx, %ebx +; SSE3-NEXT: pextrw $4, %xmm1, %edx +; SSE3-NEXT: pextrw $5, %xmm1, %r14d +; SSE3-NEXT: addl %edx, %r14d +; SSE3-NEXT: pextrw $6, %xmm1, %edx +; SSE3-NEXT: pextrw $7, %xmm1, %r12d +; SSE3-NEXT: addl %edx, %r12d +; SSE3-NEXT: movd %xmm2, %edi +; SSE3-NEXT: pextrw $1, %xmm2, %edx +; SSE3-NEXT: addl %edi, %edx +; SSE3-NEXT: pextrw $2, %xmm2, %r9d +; SSE3-NEXT: pextrw $3, %xmm2, %edi +; SSE3-NEXT: addl %r9d, %edi +; SSE3-NEXT: pextrw $4, %xmm2, %r11d +; SSE3-NEXT: pextrw $5, %xmm2, %r9d +; SSE3-NEXT: addl %r11d, %r9d +; SSE3-NEXT: pextrw $6, %xmm2, %ebp +; SSE3-NEXT: pextrw $7, %xmm2, %r11d +; SSE3-NEXT: addl %ebp, %r11d +; SSE3-NEXT: movd %xmm3, %r15d ; SSE3-NEXT: pextrw $1, %xmm3, %ebp -; SSE3-NEXT: addl %eax, %ebp -; 
SSE3-NEXT: pextrw $2, %xmm3, %edx -; SSE3-NEXT: pextrw $3, %xmm3, %edi -; SSE3-NEXT: addl %edx, %edi -; SSE3-NEXT: pextrw $4, %xmm3, %edx +; SSE3-NEXT: addl %r15d, %ebp +; SSE3-NEXT: pextrw $2, %xmm3, %r13d +; SSE3-NEXT: pextrw $3, %xmm3, %r15d +; SSE3-NEXT: addl %r13d, %r15d +; SSE3-NEXT: pextrw $4, %xmm3, %r13d ; SSE3-NEXT: pextrw $5, %xmm3, %ecx -; SSE3-NEXT: addl %edx, %ecx -; SSE3-NEXT: pextrw $6, %xmm3, %edx +; SSE3-NEXT: addl %r13d, %ecx +; SSE3-NEXT: pextrw $6, %xmm3, %r13d ; SSE3-NEXT: pextrw $7, %xmm3, %eax -; SSE3-NEXT: addl %edx, %eax -; SSE3-NEXT: movd %esi, %xmm8 -; SSE3-NEXT: movd %r8d, %xmm3 -; SSE3-NEXT: movd %ebx, %xmm9 -; SSE3-NEXT: movd %r13d, %xmm4 -; SSE3-NEXT: movd %r15d, %xmm10 -; SSE3-NEXT: movd %r11d, %xmm7 -; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload -; SSE3-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSE3-NEXT: addl %r13d, %eax +; SSE3-NEXT: movd %r12d, %xmm2 +; SSE3-NEXT: movd %r14d, %xmm3 +; SSE3-NEXT: movd %ebx, %xmm5 +; SSE3-NEXT: movd %r10d, %xmm4 +; SSE3-NEXT: movd %r8d, %xmm6 +; SSE3-NEXT: movd %esi, %xmm7 +; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSE3-NEXT: # xmm8 = mem[0],zero,zero,zero ; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE3-NEXT: movd %eax, %xmm12 -; SSE3-NEXT: movd %ecx, %xmm6 -; SSE3-NEXT: movd %edi, %xmm13 -; SSE3-NEXT: movd %ebp, %xmm5 +; SSE3-NEXT: movd %eax, %xmm9 +; SSE3-NEXT: movd %ecx, %xmm10 +; SSE3-NEXT: movd %r15d, %xmm11 +; SSE3-NEXT: movd %ebp, %xmm12 +; SSE3-NEXT: movd %r11d, %xmm13 ; SSE3-NEXT: movd %r9d, %xmm14 -; SSE3-NEXT: movd %r12d, %xmm2 -; SSE3-NEXT: movd %r14d, %xmm15 -; SSE3-NEXT: movd %r10d, %xmm1 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE3-NEXT: movd %edi, %xmm15 +; SSE3-NEXT: movd %edx, %xmm1 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE3-NEXT: punpcklwd 
{{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm12[0] ; SSE3-NEXT: popq %rbx ; SSE3-NEXT: popq %r12 ; SSE3-NEXT: popq %r13 @@ -1133,13 +1133,13 @@ ; SSE3: # %bb.0: ; SSE3-NEXT: movd %xmm0, %ecx ; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE3-NEXT: movd %xmm4, %r8d -; SSE3-NEXT: addl %ecx, %r8d +; SSE3-NEXT: movd %xmm4, %eax +; SSE3-NEXT: addl %ecx, %eax ; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE3-NEXT: movd %xmm4, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE3-NEXT: movd %xmm0, %r9d -; SSE3-NEXT: addl %edx, %r9d +; SSE3-NEXT: movd %xmm0, %ecx +; SSE3-NEXT: addl %edx, %ecx ; SSE3-NEXT: movd %xmm2, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE3-NEXT: movd %xmm0, %esi @@ -1149,36 +1149,36 @@ ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE3-NEXT: movd %xmm0, %edi ; SSE3-NEXT: addl %edx, %edi -; SSE3-NEXT: movd %xmm1, %eax +; SSE3-NEXT: movd %xmm1, %r8d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE3-NEXT: movd %xmm0, %r10d -; SSE3-NEXT: addl %eax, %r10d +; SSE3-NEXT: movd %xmm0, %edx +; SSE3-NEXT: addl %r8d, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: movd %xmm0, %r8d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE3-NEXT: movd %xmm0, %ecx -; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movd %xmm3, %eax +; SSE3-NEXT: movd %xmm0, %r9d +; SSE3-NEXT: addl %r8d, %r9d +; SSE3-NEXT: movd %xmm3, %r8d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE3-NEXT: movd %xmm0, %edx -; SSE3-NEXT: addl %eax, %edx +; SSE3-NEXT: movd %xmm0, %r10d +; SSE3-NEXT: addl %r8d, %r10d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE3-NEXT: movd %xmm0, %r11d +; SSE3-NEXT: movd %xmm0, %r8d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] -; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: addl %r11d, %eax +; SSE3-NEXT: movd %xmm0, %r11d +; SSE3-NEXT: addl %r8d, %r11d ; SSE3-NEXT: movd %edi, %xmm0 ; SSE3-NEXT: movd %esi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: movd %r9d, %xmm2 -; SSE3-NEXT: movd %r8d, %xmm0 +; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: movd %eax, %xmm0 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE3-NEXT: movd %eax, %xmm1 -; SSE3-NEXT: movd %edx, %xmm2 +; SSE3-NEXT: movd %r11d, %xmm1 +; SSE3-NEXT: movd %r10d, %xmm2 ; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE3-NEXT: movd %ecx, %xmm3 -; SSE3-NEXT: movd %r10d, %xmm1 +; SSE3-NEXT: movd %r9d, %xmm3 +; SSE3-NEXT: movd %edx, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE3-NEXT: retq @@ -1247,87 +1247,87 @@ ; SSE3-NEXT: pushq %r12 ; SSE3-NEXT: pushq %rbx ; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: pextrw $1, %xmm0, %r10d -; SSE3-NEXT: addl %eax, %r10d +; SSE3-NEXT: pextrw $1, %xmm0, %edx +; SSE3-NEXT: addl %eax, %edx ; SSE3-NEXT: pextrw $2, %xmm0, %eax -; SSE3-NEXT: pextrw $3, %xmm0, %r11d -; SSE3-NEXT: addl %eax, %r11d +; SSE3-NEXT: pextrw $3, %xmm0, %esi +; SSE3-NEXT: 
addl %eax, %esi ; SSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE3-NEXT: pextrw $5, %xmm0, %r12d -; SSE3-NEXT: addl %eax, %r12d +; SSE3-NEXT: pextrw $5, %xmm0, %r9d +; SSE3-NEXT: addl %eax, %r9d ; SSE3-NEXT: pextrw $6, %xmm0, %eax -; SSE3-NEXT: pextrw $7, %xmm0, %r13d -; SSE3-NEXT: addl %eax, %r13d -; SSE3-NEXT: movd %xmm1, %eax -; SSE3-NEXT: pextrw $1, %xmm1, %ecx -; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE3-NEXT: pextrw $2, %xmm1, %eax -; SSE3-NEXT: pextrw $3, %xmm1, %ecx -; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE3-NEXT: pextrw $4, %xmm1, %eax -; SSE3-NEXT: pextrw $5, %xmm1, %r14d -; SSE3-NEXT: addl %eax, %r14d -; SSE3-NEXT: pextrw $6, %xmm1, %esi -; SSE3-NEXT: pextrw $7, %xmm1, %r15d -; SSE3-NEXT: addl %esi, %r15d -; SSE3-NEXT: movd %xmm2, %esi +; SSE3-NEXT: pextrw $7, %xmm0, %r10d +; SSE3-NEXT: addl %eax, %r10d +; SSE3-NEXT: movd %xmm1, %ecx +; SSE3-NEXT: pextrw $1, %xmm1, %eax +; SSE3-NEXT: addl %ecx, %eax +; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE3-NEXT: pextrw $2, %xmm1, %edi +; SSE3-NEXT: pextrw $3, %xmm1, %eax +; SSE3-NEXT: addl %edi, %eax +; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE3-NEXT: pextrw $4, %xmm1, %r8d +; SSE3-NEXT: pextrw $5, %xmm1, %edi +; SSE3-NEXT: addl %r8d, %edi +; SSE3-NEXT: pextrw $6, %xmm1, %r11d +; SSE3-NEXT: pextrw $7, %xmm1, %r8d +; SSE3-NEXT: addl %r11d, %r8d +; SSE3-NEXT: movd %xmm2, %r11d ; SSE3-NEXT: pextrw $1, %xmm2, %ebp -; SSE3-NEXT: addl %esi, %ebp -; SSE3-NEXT: pextrw $2, %xmm2, %esi -; SSE3-NEXT: pextrw $3, %xmm2, %edi -; SSE3-NEXT: addl %esi, %edi -; SSE3-NEXT: pextrw $4, %xmm2, %esi -; SSE3-NEXT: pextrw $5, %xmm2, %eax -; SSE3-NEXT: addl %esi, %eax -; SSE3-NEXT: pextrw $6, %xmm2, %esi -; SSE3-NEXT: pextrw $7, %xmm2, %ecx -; SSE3-NEXT: addl %esi, %ecx +; SSE3-NEXT: addl %r11d, %ebp +; SSE3-NEXT: pextrw $2, %xmm2, %r11d +; SSE3-NEXT: pextrw $3, %xmm2, %r14d +; SSE3-NEXT: addl %r11d, %r14d +; SSE3-NEXT: pextrw $4, %xmm2, %r11d +; SSE3-NEXT: pextrw $5, %xmm2, %r15d +; SSE3-NEXT: addl %r11d, %r15d +; SSE3-NEXT: pextrw $6, %xmm2, %r11d +; SSE3-NEXT: pextrw $7, %xmm2, %r12d +; SSE3-NEXT: addl %r11d, %r12d ; SSE3-NEXT: movd %xmm3, %ebx -; SSE3-NEXT: pextrw $1, %xmm3, %r9d -; SSE3-NEXT: addl %ebx, %r9d -; SSE3-NEXT: pextrw $2, %xmm3, %edx +; SSE3-NEXT: pextrw $1, %xmm3, %r11d +; SSE3-NEXT: addl %ebx, %r11d +; SSE3-NEXT: pextrw $2, %xmm3, %r13d ; SSE3-NEXT: pextrw $3, %xmm3, %ebx -; SSE3-NEXT: addl %edx, %ebx -; SSE3-NEXT: pextrw $4, %xmm3, %edx -; SSE3-NEXT: pextrw $5, %xmm3, %esi -; SSE3-NEXT: addl %edx, %esi -; SSE3-NEXT: pextrw $6, %xmm3, %r8d -; SSE3-NEXT: pextrw $7, %xmm3, %edx -; SSE3-NEXT: addl %r8d, %edx -; SSE3-NEXT: movd %ecx, %xmm8 -; SSE3-NEXT: movd %eax, %xmm3 -; SSE3-NEXT: movd %edi, %xmm9 +; SSE3-NEXT: addl %r13d, %ebx +; SSE3-NEXT: pextrw $4, %xmm3, %r13d +; SSE3-NEXT: pextrw $5, %xmm3, %ecx +; SSE3-NEXT: addl %r13d, %ecx +; SSE3-NEXT: pextrw $6, %xmm3, %r13d +; SSE3-NEXT: pextrw $7, %xmm3, %eax +; SSE3-NEXT: addl %r13d, %eax +; SSE3-NEXT: movd %r12d, %xmm2 +; SSE3-NEXT: movd %r15d, %xmm3 +; SSE3-NEXT: movd %r14d, %xmm5 ; SSE3-NEXT: movd %ebp, %xmm4 -; SSE3-NEXT: movd %r13d, %xmm10 -; SSE3-NEXT: movd %r12d, %xmm7 -; SSE3-NEXT: movd %r11d, %xmm11 -; SSE3-NEXT: movd %r10d, %xmm0 -; SSE3-NEXT: movd %edx, %xmm12 -; SSE3-NEXT: movd %esi, %xmm6 -; SSE3-NEXT: movd %ebx, %xmm13 -; SSE3-NEXT: movd %r9d, %xmm5 -; SSE3-NEXT: movd %r15d, %xmm14 -; SSE3-NEXT: movd %r14d, %xmm2 +; 
SSE3-NEXT: movd %r10d, %xmm6 +; SSE3-NEXT: movd %r9d, %xmm7 +; SSE3-NEXT: movd %esi, %xmm8 +; SSE3-NEXT: movd %edx, %xmm0 +; SSE3-NEXT: movd %eax, %xmm9 +; SSE3-NEXT: movd %ecx, %xmm10 +; SSE3-NEXT: movd %ebx, %xmm11 +; SSE3-NEXT: movd %r11d, %xmm12 +; SSE3-NEXT: movd %r8d, %xmm13 +; SSE3-NEXT: movd %edi, %xmm14 ; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload ; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero ; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload ; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm12[0] ; SSE3-NEXT: popq %rbx ; SSE3-NEXT: popq %r12 ; SSE3-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll --- a/llvm/test/CodeGen/X86/haddsub-4.ll +++ b/llvm/test/CodeGen/X86/haddsub-4.ll @@ -314,20 +314,19 @@ define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind { ; SSE-LABEL: hadd_reverse_v16f32: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm8 ; SSE-NEXT: haddps %xmm3, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,2,1,0] ; SSE-NEXT: haddps %xmm7, %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2,1,0] -; SSE-NEXT: haddps %xmm1, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0] -; SSE-NEXT: haddps %xmm5, %xmm8 +; SSE-NEXT: 
haddps %xmm1, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,2,1,0] +; SSE-NEXT: haddps %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0] ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_reverse_v16f32: diff --git a/llvm/test/CodeGen/X86/hoist-invariant-load.ll b/llvm/test/CodeGen/X86/hoist-invariant-load.ll --- a/llvm/test/CodeGen/X86/hoist-invariant-load.ll +++ b/llvm/test/CodeGen/X86/hoist-invariant-load.ll @@ -218,12 +218,12 @@ ; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: xorl %r8d, %r8d ; CHECK-NEXT: movq (%rdi), %rdx -; CHECK-NEXT: movq (%rsi), %r9 +; CHECK-NEXT: movq (%rsi), %rsi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB4_2: ## %for.body ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mulxq %r9, %rsi, %rdi -; CHECK-NEXT: addq %rsi, (%rax) +; CHECK-NEXT: mulxq %rsi, %r9, %rdi +; CHECK-NEXT: addq %r9, (%rax) ; CHECK-NEXT: adcq %rdi, 8(%rax) ; CHECK-NEXT: ## %bb.1: ## %for.check ; CHECK-NEXT: ## in Loop: Header=BB4_2 Depth=1 diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll --- a/llvm/test/CodeGen/X86/i128-mul.ll +++ b/llvm/test/CodeGen/X86/i128-mul.ll @@ -260,20 +260,19 @@ ; X64-NOBMI-NEXT: testq %rdi, %rdi ; X64-NOBMI-NEXT: je .LBB1_3 ; X64-NOBMI-NEXT: # %bb.1: # %for.body.preheader -; X64-NOBMI-NEXT: movq %rcx, %r8 -; X64-NOBMI-NEXT: movq %rdx, %r9 +; X64-NOBMI-NEXT: movq %rdx, %r8 ; X64-NOBMI-NEXT: xorl %r10d, %r10d -; X64-NOBMI-NEXT: xorl %ecx, %ecx +; X64-NOBMI-NEXT: xorl %r9d, %r9d ; X64-NOBMI-NEXT: .p2align 4, 0x90 ; X64-NOBMI-NEXT: .LBB1_2: # %for.body ; X64-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NOBMI-NEXT: movq %r8, %rax -; X64-NOBMI-NEXT: mulq (%r9,%rcx,8) +; X64-NOBMI-NEXT: movq %rcx, %rax +; X64-NOBMI-NEXT: mulq (%r8,%r9,8) ; X64-NOBMI-NEXT: addq %r10, %rax ; X64-NOBMI-NEXT: adcq $0, %rdx -; X64-NOBMI-NEXT: movq %rax, (%rsi,%rcx,8) -; X64-NOBMI-NEXT: incq %rcx -; X64-NOBMI-NEXT: cmpq %rcx, %rdi +; X64-NOBMI-NEXT: movq %rax, (%rsi,%r9,8) +; X64-NOBMI-NEXT: incq %r9 +; X64-NOBMI-NEXT: cmpq %r9, %rdi ; X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: jne .LBB1_2 ; X64-NOBMI-NEXT: .LBB1_3: # %for.end @@ -285,21 +284,20 @@ ; X64-BMI-NEXT: testq %rdi, %rdi ; X64-BMI-NEXT: je .LBB1_3 ; X64-BMI-NEXT: # %bb.1: # %for.body.preheader -; X64-BMI-NEXT: movq %rcx, %r8 -; X64-BMI-NEXT: movq %rdx, %r9 -; X64-BMI-NEXT: xorl %r10d, %r10d -; X64-BMI-NEXT: xorl %ecx, %ecx +; X64-BMI-NEXT: movq %rdx, %rax +; X64-BMI-NEXT: xorl %r9d, %r9d +; X64-BMI-NEXT: xorl %r8d, %r8d ; X64-BMI-NEXT: .p2align 4, 0x90 ; X64-BMI-NEXT: .LBB1_2: # %for.body ; X64-BMI-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-BMI-NEXT: movq %r8, %rdx -; X64-BMI-NEXT: mulxq (%r9,%rcx,8), %rax, %rdx -; X64-BMI-NEXT: addq %r10, %rax +; X64-BMI-NEXT: movq %rcx, %rdx +; X64-BMI-NEXT: mulxq (%rax,%r8,8), %r10, %rdx +; X64-BMI-NEXT: addq %r9, %r10 ; X64-BMI-NEXT: adcq $0, %rdx -; X64-BMI-NEXT: movq %rax, (%rsi,%rcx,8) -; X64-BMI-NEXT: incq %rcx -; X64-BMI-NEXT: cmpq %rcx, %rdi -; X64-BMI-NEXT: movq %rdx, %r10 +; X64-BMI-NEXT: movq %r10, (%rsi,%r8,8) +; X64-BMI-NEXT: incq %r8 +; X64-BMI-NEXT: cmpq %r8, %rdi +; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: jne .LBB1_2 ; X64-BMI-NEXT: .LBB1_3: # %for.end ; X64-BMI-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/load-local-v3i1.ll b/llvm/test/CodeGen/X86/load-local-v3i1.ll --- 
a/llvm/test/CodeGen/X86/load-local-v3i1.ll +++ b/llvm/test/CodeGen/X86/load-local-v3i1.ll @@ -93,7 +93,7 @@ ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movzbl (%rdx), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrb %cl @@ -102,17 +102,17 @@ ; CHECK-NEXT: shrb $2, %dl ; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: movzbl %al, %ebp -; CHECK-NEXT: movzbl %dl, %r15d -; CHECK-NEXT: movzbl %cl, %ebx +; CHECK-NEXT: movzbl %dl, %r14d +; CHECK-NEXT: movzbl %cl, %r15d ; CHECK-NEXT: movq %rsi, %rdi ; CHECK-NEXT: movl %ebp, %esi -; CHECK-NEXT: movl %ebx, %edx -; CHECK-NEXT: movl %r15d, %ecx +; CHECK-NEXT: movl %r15d, %edx +; CHECK-NEXT: movl %r14d, %ecx ; CHECK-NEXT: callq masked_load_v3@PLT -; CHECK-NEXT: movq %r14, %rdi +; CHECK-NEXT: movq %rbx, %rdi ; CHECK-NEXT: movl %ebp, %esi -; CHECK-NEXT: movl %ebx, %edx -; CHECK-NEXT: movl %r15d, %ecx +; CHECK-NEXT: movl %r15d, %edx +; CHECK-NEXT: movl %r14d, %ecx ; CHECK-NEXT: callq masked_store4_v3@PLT ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/lrshrink.ll b/llvm/test/CodeGen/X86/lrshrink.ll --- a/llvm/test/CodeGen/X86/lrshrink.ll +++ b/llvm/test/CodeGen/X86/lrshrink.ll @@ -16,28 +16,28 @@ ; CHECK-NEXT: .cfi_offset %rbx, -32 ; CHECK-NEXT: .cfi_offset %r14, -24 ; CHECK-NEXT: .cfi_offset %r15, -16 -; CHECK-NEXT: movq %rcx, %r14 -; CHECK-NEXT: movl $4, %r15d +; CHECK-NEXT: movq %rcx, %rbx +; CHECK-NEXT: movl $4, %r14d ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %then ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; CHECK-NEXT: movl $10, %r15d +; CHECK-NEXT: movl $10, %r14d ; CHECK-NEXT: movq %rdx, %rsi -; CHECK-NEXT: movq %r8, %r14 +; CHECK-NEXT: movq %r8, %rbx ; CHECK-NEXT: .LBB0_2: # %else -; CHECK-NEXT: addq %r9, %r14 -; CHECK-NEXT: addq %rsi, %r15 -; CHECK-NEXT: callq _Z3foov@PLT -; CHECK-NEXT: movl %eax, %ebx -; CHECK-NEXT: addq %r15, %rbx +; CHECK-NEXT: addq %r9, %rbx +; CHECK-NEXT: addq %rsi, %r14 ; CHECK-NEXT: callq _Z3foov@PLT ; CHECK-NEXT: movl %eax, %r15d -; CHECK-NEXT: addq %rbx, %r15 +; CHECK-NEXT: addq %r14, %r15 +; CHECK-NEXT: callq _Z3foov@PLT +; CHECK-NEXT: movl %eax, %r14d +; CHECK-NEXT: addq %r15, %r14 ; CHECK-NEXT: callq _Z3foov@PLT ; CHECK-NEXT: movl %eax, %eax -; CHECK-NEXT: addq %r15, %rax ; CHECK-NEXT: addq %r14, %rax +; CHECK-NEXT: addq %rbx, %rax ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: popq %r14 diff --git a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll --- a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll +++ b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll @@ -10,82 +10,84 @@ ; GENERIC-LABEL: t: ; GENERIC: ## %bb.0: ## %entry ; GENERIC-NEXT: pushq %rbp +; GENERIC-NEXT: pushq %r15 ; GENERIC-NEXT: pushq %r14 ; GENERIC-NEXT: pushq %rbx ; GENERIC-NEXT: ## kill: def $ecx killed $ecx def $rcx -; GENERIC-NEXT: movl (%rdx), %eax +; GENERIC-NEXT: movl (%rdx), %r8d ; GENERIC-NEXT: movl 4(%rdx), %ebx ; GENERIC-NEXT: decl %ecx -; GENERIC-NEXT: leaq 20(%rdx), %r11 -; GENERIC-NEXT: movq _Te0@GOTPCREL(%rip), %r9 -; GENERIC-NEXT: movq _Te1@GOTPCREL(%rip), %r8 +; GENERIC-NEXT: leaq 20(%rdx), %r9 +; GENERIC-NEXT: movq _Te0@GOTPCREL(%rip), %rdi +; GENERIC-NEXT: movq _Te1@GOTPCREL(%rip), %rax ; GENERIC-NEXT: movq _Te3@GOTPCREL(%rip), %r10 -; GENERIC-NEXT: movq %rcx, %r14 +; GENERIC-NEXT: movq %rcx, %r11 ; GENERIC-NEXT: .p2align 4, 0x90 ; GENERIC-NEXT: LBB0_1: ## %bb ; GENERIC-NEXT: ## =>This Inner Loop 
Header: Depth=1 -; GENERIC-NEXT: movzbl %al, %edi -; GENERIC-NEXT: ## kill: def $eax killed $eax def $rax -; GENERIC-NEXT: shrl $24, %eax +; GENERIC-NEXT: movzbl %r8b, %r14d +; GENERIC-NEXT: ## kill: def $r8d killed $r8d def $r8 +; GENERIC-NEXT: shrl $24, %r8d ; GENERIC-NEXT: movl %ebx, %ebp ; GENERIC-NEXT: shrl $16, %ebp -; GENERIC-NEXT: movzbl %bpl, %ebp -; GENERIC-NEXT: movl (%r8,%rbp,4), %ebp -; GENERIC-NEXT: xorl (%r9,%rax,4), %ebp -; GENERIC-NEXT: xorl -12(%r11), %ebp +; GENERIC-NEXT: movzbl %bpl, %r15d +; GENERIC-NEXT: movl (%rax,%r15,4), %ebp +; GENERIC-NEXT: xorl (%rdi,%r8,4), %ebp +; GENERIC-NEXT: xorl -12(%r9), %ebp ; GENERIC-NEXT: shrl $24, %ebx -; GENERIC-NEXT: movl (%r10,%rdi,4), %edi -; GENERIC-NEXT: xorl (%r9,%rbx,4), %edi -; GENERIC-NEXT: xorl -8(%r11), %edi -; GENERIC-NEXT: movl %ebp, %eax -; GENERIC-NEXT: shrl $24, %eax -; GENERIC-NEXT: movl (%r9,%rax,4), %eax -; GENERIC-NEXT: subq $1, %r14 +; GENERIC-NEXT: movl (%r10,%r14,4), %r14d +; GENERIC-NEXT: xorl (%rdi,%rbx,4), %r14d +; GENERIC-NEXT: xorl -8(%r9), %r14d +; GENERIC-NEXT: movl %ebp, %r8d +; GENERIC-NEXT: shrl $24, %r8d +; GENERIC-NEXT: movl (%rdi,%r8,4), %r8d +; GENERIC-NEXT: subq $1, %r11 ; GENERIC-NEXT: jb LBB0_3 ; GENERIC-NEXT: ## %bb.2: ## %bb1 ; GENERIC-NEXT: ## in Loop: Header=BB0_1 Depth=1 -; GENERIC-NEXT: movl %edi, %ebx +; GENERIC-NEXT: movl %r14d, %ebx ; GENERIC-NEXT: shrl $16, %ebx ; GENERIC-NEXT: movzbl %bl, %ebx -; GENERIC-NEXT: xorl (%r8,%rbx,4), %eax -; GENERIC-NEXT: xorl -4(%r11), %eax -; GENERIC-NEXT: shrl $24, %edi +; GENERIC-NEXT: xorl (%rax,%rbx,4), %r8d +; GENERIC-NEXT: xorl -4(%r9), %r8d +; GENERIC-NEXT: shrl $24, %r14d ; GENERIC-NEXT: movzbl %bpl, %ebx ; GENERIC-NEXT: movl (%r10,%rbx,4), %ebx -; GENERIC-NEXT: xorl (%r9,%rdi,4), %ebx -; GENERIC-NEXT: xorl (%r11), %ebx -; GENERIC-NEXT: addq $16, %r11 +; GENERIC-NEXT: xorl (%rdi,%r14,4), %ebx +; GENERIC-NEXT: xorl (%r9), %ebx +; GENERIC-NEXT: addq $16, %r9 ; GENERIC-NEXT: jmp LBB0_1 ; GENERIC-NEXT: LBB0_3: ## %bb2 ; GENERIC-NEXT: shlq $4, %rcx -; GENERIC-NEXT: andl $-16777216, %eax ## imm = 0xFF000000 -; GENERIC-NEXT: movl %edi, %ebx -; GENERIC-NEXT: shrl $16, %ebx -; GENERIC-NEXT: movzbl %bl, %ebx -; GENERIC-NEXT: movzbl 2(%r8,%rbx,4), %ebx -; GENERIC-NEXT: shll $16, %ebx -; GENERIC-NEXT: orl %eax, %ebx -; GENERIC-NEXT: xorl 16(%rcx,%rdx), %ebx -; GENERIC-NEXT: shrl $8, %edi -; GENERIC-NEXT: movzbl 3(%r9,%rdi,4), %eax -; GENERIC-NEXT: shll $24, %eax -; GENERIC-NEXT: movzbl %bpl, %edi -; GENERIC-NEXT: movzbl 2(%r8,%rdi,4), %edi -; GENERIC-NEXT: shll $16, %edi -; GENERIC-NEXT: orl %eax, %edi -; GENERIC-NEXT: xorl 20(%rcx,%rdx), %edi -; GENERIC-NEXT: movl %ebx, %eax -; GENERIC-NEXT: shrl $24, %eax -; GENERIC-NEXT: movb %al, (%rsi) -; GENERIC-NEXT: shrl $16, %ebx -; GENERIC-NEXT: movb %bl, 1(%rsi) -; GENERIC-NEXT: movl %edi, %eax -; GENERIC-NEXT: shrl $24, %eax -; GENERIC-NEXT: movb %al, 4(%rsi) -; GENERIC-NEXT: shrl $16, %edi -; GENERIC-NEXT: movb %dil, 5(%rsi) +; GENERIC-NEXT: andl $-16777216, %r8d ## imm = 0xFF000000 +; GENERIC-NEXT: movl %r14d, %r9d +; GENERIC-NEXT: shrl $16, %r9d +; GENERIC-NEXT: movzbl %r9b, %r9d +; GENERIC-NEXT: movzbl 2(%rax,%r9,4), %r9d +; GENERIC-NEXT: shll $16, %r9d +; GENERIC-NEXT: orl %r8d, %r9d +; GENERIC-NEXT: xorl 16(%rcx,%rdx), %r9d +; GENERIC-NEXT: shrl $8, %r14d +; GENERIC-NEXT: movzbl 3(%rdi,%r14,4), %edi +; GENERIC-NEXT: shll $24, %edi +; GENERIC-NEXT: movzbl %bpl, %r8d +; GENERIC-NEXT: movzbl 2(%rax,%r8,4), %eax +; GENERIC-NEXT: shll $16, %eax +; GENERIC-NEXT: orl %edi, %eax +; GENERIC-NEXT: xorl 
20(%rcx,%rdx), %eax +; GENERIC-NEXT: movl %r9d, %ecx +; GENERIC-NEXT: shrl $24, %ecx +; GENERIC-NEXT: movb %cl, (%rsi) +; GENERIC-NEXT: shrl $16, %r9d +; GENERIC-NEXT: movb %r9b, 1(%rsi) +; GENERIC-NEXT: movl %eax, %ecx +; GENERIC-NEXT: shrl $24, %ecx +; GENERIC-NEXT: movb %cl, 4(%rsi) +; GENERIC-NEXT: shrl $16, %eax +; GENERIC-NEXT: movb %al, 5(%rsi) ; GENERIC-NEXT: popq %rbx ; GENERIC-NEXT: popq %r14 +; GENERIC-NEXT: popq %r15 ; GENERIC-NEXT: popq %rbp ; GENERIC-NEXT: retq ; @@ -96,76 +98,77 @@ ; ATOM-NEXT: pushq %r14 ; ATOM-NEXT: pushq %rbx ; ATOM-NEXT: ## kill: def $ecx killed $ecx def $rcx -; ATOM-NEXT: movl (%rdx), %r15d -; ATOM-NEXT: movl 4(%rdx), %eax -; ATOM-NEXT: leaq 20(%rdx), %r11 -; ATOM-NEXT: movq _Te0@GOTPCREL(%rip), %r9 -; ATOM-NEXT: movq _Te1@GOTPCREL(%rip), %r8 +; ATOM-NEXT: movl (%rdx), %r8d +; ATOM-NEXT: movl 4(%rdx), %r15d +; ATOM-NEXT: leaq 20(%rdx), %r9 +; ATOM-NEXT: movq _Te0@GOTPCREL(%rip), %rdi +; ATOM-NEXT: movq _Te1@GOTPCREL(%rip), %rax ; ATOM-NEXT: movq _Te3@GOTPCREL(%rip), %r10 ; ATOM-NEXT: decl %ecx -; ATOM-NEXT: movq %rcx, %r14 +; ATOM-NEXT: movq %rcx, %r11 ; ATOM-NEXT: .p2align 4, 0x90 ; ATOM-NEXT: LBB0_1: ## %bb ; ATOM-NEXT: ## =>This Inner Loop Header: Depth=1 -; ATOM-NEXT: movl %eax, %edi -; ATOM-NEXT: movl %r15d, %ebp -; ATOM-NEXT: shrl $24, %eax -; ATOM-NEXT: shrl $16, %edi -; ATOM-NEXT: shrl $24, %ebp -; ATOM-NEXT: movzbl %dil, %edi -; ATOM-NEXT: movl (%r8,%rdi,4), %ebx -; ATOM-NEXT: movzbl %r15b, %edi -; ATOM-NEXT: xorl (%r9,%rbp,4), %ebx -; ATOM-NEXT: movl (%r10,%rdi,4), %edi -; ATOM-NEXT: xorl -12(%r11), %ebx -; ATOM-NEXT: xorl (%r9,%rax,4), %edi -; ATOM-NEXT: movl %ebx, %eax -; ATOM-NEXT: xorl -8(%r11), %edi -; ATOM-NEXT: shrl $24, %eax -; ATOM-NEXT: movl (%r9,%rax,4), %r15d -; ATOM-NEXT: subq $1, %r14 -; ATOM-NEXT: movl %edi, %eax +; ATOM-NEXT: movl %r15d, %ebx +; ATOM-NEXT: movl %r8d, %r14d +; ATOM-NEXT: movzbl %r8b, %r8d +; ATOM-NEXT: shrl $24, %r15d +; ATOM-NEXT: shrl $16, %ebx +; ATOM-NEXT: shrl $24, %r14d +; ATOM-NEXT: movzbl %bl, %ebx +; ATOM-NEXT: movl (%rax,%rbx,4), %ebx +; ATOM-NEXT: xorl (%rdi,%r14,4), %ebx +; ATOM-NEXT: movl (%r10,%r8,4), %r14d +; ATOM-NEXT: xorl -12(%r9), %ebx +; ATOM-NEXT: xorl (%rdi,%r15,4), %r14d +; ATOM-NEXT: movl %ebx, %r8d +; ATOM-NEXT: xorl -8(%r9), %r14d +; ATOM-NEXT: shrl $24, %r8d +; ATOM-NEXT: subq $1, %r11 +; ATOM-NEXT: movl (%rdi,%r8,4), %r8d ; ATOM-NEXT: jb LBB0_3 ; ATOM-NEXT: ## %bb.2: ## %bb1 ; ATOM-NEXT: ## in Loop: Header=BB0_1 Depth=1 -; ATOM-NEXT: shrl $16, %eax -; ATOM-NEXT: shrl $24, %edi -; ATOM-NEXT: movzbl %al, %eax -; ATOM-NEXT: xorl (%r8,%rax,4), %r15d -; ATOM-NEXT: movzbl %bl, %eax -; ATOM-NEXT: movl (%r10,%rax,4), %eax -; ATOM-NEXT: xorl -4(%r11), %r15d -; ATOM-NEXT: xorl (%r9,%rdi,4), %eax -; ATOM-NEXT: xorl (%r11), %eax -; ATOM-NEXT: addq $16, %r11 +; ATOM-NEXT: movl %r14d, %ebp +; ATOM-NEXT: movzbl %bl, %ebx +; ATOM-NEXT: shrl $24, %r14d +; ATOM-NEXT: shrl $16, %ebp +; ATOM-NEXT: movzbl %bpl, %r15d +; ATOM-NEXT: xorl (%rax,%r15,4), %r8d +; ATOM-NEXT: movl (%r10,%rbx,4), %r15d +; ATOM-NEXT: xorl (%rdi,%r14,4), %r15d +; ATOM-NEXT: xorl -4(%r9), %r8d +; ATOM-NEXT: xorl (%r9), %r15d +; ATOM-NEXT: addq $16, %r9 ; ATOM-NEXT: jmp LBB0_1 ; ATOM-NEXT: LBB0_3: ## %bb2 -; ATOM-NEXT: shrl $16, %eax -; ATOM-NEXT: shrl $8, %edi -; ATOM-NEXT: movzbl %bl, %ebp -; ATOM-NEXT: andl $-16777216, %r15d ## imm = 0xFF000000 +; ATOM-NEXT: movl %r14d, %r9d +; ATOM-NEXT: andl $-16777216, %r8d ## imm = 0xFF000000 +; ATOM-NEXT: shrl $8, %r14d ; ATOM-NEXT: shlq $4, %rcx -; ATOM-NEXT: movzbl %al, %eax -; ATOM-NEXT: 
movzbl 3(%r9,%rdi,4), %edi -; ATOM-NEXT: movzbl 2(%r8,%rbp,4), %ebp -; ATOM-NEXT: movzbl 2(%r8,%rax,4), %eax +; ATOM-NEXT: shrl $16, %r9d +; ATOM-NEXT: movzbl 3(%rdi,%r14,4), %edi +; ATOM-NEXT: movzbl %r9b, %r9d ; ATOM-NEXT: shll $24, %edi -; ATOM-NEXT: shll $16, %ebp +; ATOM-NEXT: movzbl 2(%rax,%r9,4), %r9d +; ATOM-NEXT: shll $16, %r9d +; ATOM-NEXT: orl %r8d, %r9d +; ATOM-NEXT: movzbl %bl, %r8d +; ATOM-NEXT: movzbl 2(%rax,%r8,4), %eax +; ATOM-NEXT: xorl 16(%rcx,%rdx), %r9d ; ATOM-NEXT: shll $16, %eax -; ATOM-NEXT: orl %edi, %ebp -; ATOM-NEXT: orl %r15d, %eax -; ATOM-NEXT: xorl 20(%rcx,%rdx), %ebp -; ATOM-NEXT: xorl 16(%rcx,%rdx), %eax -; ATOM-NEXT: movl %eax, %edi -; ATOM-NEXT: shrl $16, %eax +; ATOM-NEXT: orl %edi, %eax +; ATOM-NEXT: movl %r9d, %edi +; ATOM-NEXT: shrl $16, %r9d +; ATOM-NEXT: xorl 20(%rcx,%rdx), %eax ; ATOM-NEXT: shrl $24, %edi +; ATOM-NEXT: movl %eax, %ecx +; ATOM-NEXT: shrl $16, %eax ; ATOM-NEXT: movb %dil, (%rsi) -; ATOM-NEXT: movb %al, 1(%rsi) -; ATOM-NEXT: movl %ebp, %eax -; ATOM-NEXT: shrl $16, %ebp -; ATOM-NEXT: shrl $24, %eax -; ATOM-NEXT: movb %al, 4(%rsi) -; ATOM-NEXT: movb %bpl, 5(%rsi) +; ATOM-NEXT: movb %r9b, 1(%rsi) +; ATOM-NEXT: shrl $24, %ecx +; ATOM-NEXT: movb %cl, 4(%rsi) +; ATOM-NEXT: movb %al, 5(%rsi) ; ATOM-NEXT: popq %rbx ; ATOM-NEXT: popq %r14 ; ATOM-NEXT: popq %r15 diff --git a/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll --- a/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll +++ b/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll @@ -236,16 +236,16 @@ ; NOFASTLZCNT-LABEL: test_zext_cmp8: ; NOFASTLZCNT: # %bb.0: # %entry ; NOFASTLZCNT-NEXT: testl %edi, %edi -; NOFASTLZCNT-NEXT: sete %dil -; NOFASTLZCNT-NEXT: testl %esi, %esi ; NOFASTLZCNT-NEXT: sete %al -; NOFASTLZCNT-NEXT: orb %dil, %al +; NOFASTLZCNT-NEXT: testl %esi, %esi +; NOFASTLZCNT-NEXT: sete %sil +; NOFASTLZCNT-NEXT: orb %al, %sil ; NOFASTLZCNT-NEXT: testl %edx, %edx -; NOFASTLZCNT-NEXT: sete %dl +; NOFASTLZCNT-NEXT: sete %al ; NOFASTLZCNT-NEXT: testl %ecx, %ecx ; NOFASTLZCNT-NEXT: sete %cl -; NOFASTLZCNT-NEXT: orb %dl, %cl ; NOFASTLZCNT-NEXT: orb %al, %cl +; NOFASTLZCNT-NEXT: orb %sil, %cl ; NOFASTLZCNT-NEXT: movzbl %cl, %eax ; NOFASTLZCNT-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll --- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll +++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll @@ -1012,63 +1012,63 @@ ; SSE: # %bb.0: ; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm8, %xmm2 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pxor %xmm2, %xmm8 ; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE-NEXT: pcmpgtd %xmm8, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pcmpeqd %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE-NEXT: pand %xmm10, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: por %xmm8, %xmm3 ; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor 
%xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pxor %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pxor %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: pxor %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 @@ -1218,63 +1218,63 @@ ; SSE: # %bb.0: ; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm8, %xmm2 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pxor %xmm2, %xmm8 ; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE-NEXT: pcmpgtd %xmm8, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pcmpeqd %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE-NEXT: pand %xmm10, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: por %xmm8, %xmm3 ; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm3 ; 
SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pxor %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pxor %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: pxor %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 @@ -1425,63 +1425,63 @@ ; SSE: # %bb.0: ; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pxor %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm9 ; SSE-NEXT: pcmpgtd %xmm3, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pcmpeqd %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE-NEXT: pand %xmm10, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] -; SSE-NEXT: 
por %xmm2, %xmm3 +; SSE-NEXT: por %xmm8, %xmm3 ; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pxor %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pxor %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: pxor %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pcmpgtd %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 @@ -1631,63 +1631,63 @@ ; SSE: # %bb.0: ; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pxor %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm9 ; SSE-NEXT: pcmpgtd %xmm3, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pcmpeqd %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE-NEXT: pand 
%xmm10, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: por %xmm8, %xmm3 ; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pxor %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pxor %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: pxor %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pcmpgtd %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 @@ -1947,13 +1947,13 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pxor %xmm4, 
%xmm6 @@ -1963,106 +1963,106 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pxor %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE-NEXT: pcmpeqd %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: 
pand %xmm0, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE-NEXT: pcmpeqd %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm9 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE-NEXT: pxor %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pcmpgtd %xmm3, %xmm7 ; SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_umax_v8i64: @@ -2328,13 +2328,13 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: pand 
%xmm9, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pxor %xmm4, %xmm6 @@ -2344,106 +2344,106 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pxor %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE-NEXT: pcmpeqd %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE-NEXT: pcmpeqd %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm9 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE-NEXT: pxor %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pcmpgtd %xmm3, %xmm7 ; SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; 
SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_smax_v8i64: @@ -2712,13 +2712,13 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pxor %xmm4, %xmm6 @@ -2728,106 +2728,106 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pxor %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE-NEXT: pcmpeqd %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 
+; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE-NEXT: pcmpeqd %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm9 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE-NEXT: pxor %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pcmpgtd %xmm4, %xmm7 
; SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_umin_v8i64: @@ -3093,13 +3093,13 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pxor %xmm4, %xmm6 @@ -3109,106 +3109,106 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pxor %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE-NEXT: pcmpeqd 
%xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pxor %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE-NEXT: pcmpeqd %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm9 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] ; SSE-NEXT: 
por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE-NEXT: pxor %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pcmpgtd %xmm4, %xmm7 ; SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: reassociate_smin_v8i64: diff --git a/llvm/test/CodeGen/X86/machine-cp.ll b/llvm/test/CodeGen/X86/machine-cp.ll --- a/llvm/test/CodeGen/X86/machine-cp.ll +++ b/llvm/test/CodeGen/X86/machine-cp.ll @@ -100,29 +100,29 @@ ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %bb ; CHECK-NEXT: movaps %xmm3, %xmm9 -; CHECK-NEXT: movaps %xmm2, %xmm8 +; CHECK-NEXT: movaps %xmm2, %xmm5 ; CHECK-NEXT: movaps %xmm0, %xmm7 ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movaps %xmm3, %xmm2 ; CHECK-NEXT: cmpltps %xmm0, %xmm2 ; CHECK-NEXT: movaps %xmm2, %xmm4 ; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; CHECK-NEXT: movaps %xmm4, %xmm10 -; CHECK-NEXT: andnps %xmm2, %xmm10 -; CHECK-NEXT: movaps %xmm8, %xmm5 -; CHECK-NEXT: cmpltps %xmm0, %xmm5 +; CHECK-NEXT: movaps %xmm4, %xmm8 +; CHECK-NEXT: andnps %xmm2, %xmm8 +; CHECK-NEXT: movaps %xmm5, %xmm6 +; CHECK-NEXT: cmpltps %xmm0, %xmm6 ; CHECK-NEXT: movaps {{.*#+}} xmm11 = [9,10,11,12] -; CHECK-NEXT: movaps %xmm5, %xmm2 +; CHECK-NEXT: movaps %xmm6, %xmm2 ; CHECK-NEXT: orps %xmm11, %xmm2 -; CHECK-NEXT: movaps %xmm2, %xmm14 -; CHECK-NEXT: andnps %xmm5, %xmm14 +; CHECK-NEXT: movaps %xmm2, %xmm10 +; CHECK-NEXT: andnps %xmm6, %xmm10 ; CHECK-NEXT: cvttps2dq %xmm1, %xmm12 ; CHECK-NEXT: cmpltps %xmm0, %xmm1 ; CHECK-NEXT: movaps {{.*#+}} xmm13 = [5,6,7,8] ; CHECK-NEXT: movaps %xmm1, %xmm6 ; CHECK-NEXT: orps %xmm13, %xmm6 -; CHECK-NEXT: movaps %xmm6, %xmm5 -; CHECK-NEXT: andnps %xmm1, %xmm5 +; CHECK-NEXT: movaps %xmm6, %xmm14 +; CHECK-NEXT: andnps %xmm1, %xmm14 ; CHECK-NEXT: cvttps2dq %xmm7, %xmm3 ; CHECK-NEXT: cmpltps %xmm0, %xmm7 ; CHECK-NEXT: movaps {{.*#+}} xmm15 = [1,2,3,4] @@ -139,20 +139,20 @@ ; CHECK-NEXT: andps %xmm13, %xmm6 ; CHECK-NEXT: cvtdq2ps %xmm12, %xmm1 ; CHECK-NEXT: andps %xmm1, %xmm6 -; CHECK-NEXT: andps %xmm3, %xmm5 -; CHECK-NEXT: orps %xmm5, %xmm6 +; CHECK-NEXT: andps %xmm3, %xmm14 +; CHECK-NEXT: orps %xmm14, %xmm6 ; CHECK-NEXT: andps %xmm11, %xmm2 -; CHECK-NEXT: cvttps2dq %xmm8, %xmm1 +; CHECK-NEXT: cvttps2dq %xmm5, %xmm1 ; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1 ; CHECK-NEXT: andps %xmm1, %xmm2 -; CHECK-NEXT: andps %xmm3, %xmm14 -; CHECK-NEXT: orps %xmm14, %xmm2 ; CHECK-NEXT: andps %xmm3, %xmm10 +; CHECK-NEXT: orps %xmm10, %xmm2 +; CHECK-NEXT: andps %xmm3, %xmm8 ; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; CHECK-NEXT: cvttps2dq %xmm9, %xmm1 ; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1 ; CHECK-NEXT: andps %xmm1, %xmm4 -; CHECK-NEXT: orps %xmm10, %xmm4 +; CHECK-NEXT: orps %xmm8, %xmm4 ; CHECK-NEXT: movaps %xmm6, %xmm1 ; CHECK-NEXT: movaps %xmm4, %xmm3 ; 
CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -350,7 +350,7 @@ ; SSE2-LABEL: _Z10test_shortPsS_i_1024: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 @@ -362,28 +362,28 @@ ; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm5 ; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm6 ; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm7 -; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm9 -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm0 -; SSE2-NEXT: pmaddwd %xmm5, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm0 -; SSE2-NEXT: pmaddwd %xmm6, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm0 -; SSE2-NEXT: pmaddwd %xmm7, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm0 -; SSE2-NEXT: pmaddwd %xmm9, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm8 +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm9 +; SSE2-NEXT: pmaddwd %xmm5, %xmm9 +; SSE2-NEXT: paddd %xmm9, %xmm2 +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm5 +; SSE2-NEXT: pmaddwd %xmm6, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm4 +; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm5 +; SSE2-NEXT: pmaddwd %xmm7, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm5 +; SSE2-NEXT: pmaddwd %xmm8, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm3 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: paddd %xmm0, %xmm4 +; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm8, %xmm2 -; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] @@ -934,7 +934,7 @@ ; SSE2-LABEL: _Z9test_charPcS_i_1024: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 @@ -944,42 +944,42 @@ ; SSE2-NEXT: .LBB7_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7 -; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm10 -; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm0 -; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm6 +; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm8 +; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSE2-NEXT: psraw $8, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; SSE2-NEXT: psraw $8, %xmm10 +; SSE2-NEXT: pmaddwd 
%xmm9, %xmm10 +; SSE2-NEXT: paddd %xmm10, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm8 +; SSE2-NEXT: pmaddwd %xmm7, %xmm8 +; SSE2-NEXT: paddd %xmm8, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: psraw $8, %xmm7 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; SSE2-NEXT: psraw $8, %xmm8 +; SSE2-NEXT: pmaddwd %xmm7, %xmm8 +; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: pmaddwd %xmm5, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pmaddwd %xmm5, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: pmaddwd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: pmaddwd %xmm0, %xmm5 +; SSE2-NEXT: pmaddwd %xmm6, %xmm5 ; SSE2-NEXT: paddd %xmm5, %xmm3 ; SSE2-NEXT: addq $32, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB7_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: paddd %xmm0, %xmm4 +; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm8, %xmm2 -; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] @@ -1388,20 +1388,20 @@ ; SSE2-NEXT: .LBB10_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm4 -; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm8 +; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm5 ; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm6 ; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm7 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pmulhuw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pmulhuw %xmm4, %xmm8 ; SSE2-NEXT: pmullw %xmm4, %xmm6 ; SSE2-NEXT: movdqa %xmm6, %xmm4 
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] ; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; SSE2-NEXT: paddd %xmm6, %xmm1 ; SSE2-NEXT: movdqa %xmm7, %xmm4 -; SSE2-NEXT: pmulhuw %xmm8, %xmm4 -; SSE2-NEXT: pmullw %xmm8, %xmm7 +; SSE2-NEXT: pmulhuw %xmm5, %xmm4 +; SSE2-NEXT: pmullw %xmm5, %xmm7 ; SSE2-NEXT: movdqa %xmm7, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; SSE2-NEXT: paddd %xmm5, %xmm3 @@ -1564,11 +1564,11 @@ ; SSE2-LABEL: test_unsigned_short_1024: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pxor %xmm5, %xmm5 @@ -1576,59 +1576,59 @@ ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB11_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm0 -; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pmulhuw %xmm0, %xmm2 -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pmulhuw %xmm0, %xmm1 -; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pmulhuw %xmm0, %xmm2 -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pmulhuw %xmm0, %xmm1 -; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: paddd %xmm2, %xmm10 +; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm8 +; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; 
SSE2-NEXT: pmulhuw %xmm8, %xmm10 +; SSE2-NEXT: pmullw %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; SSE2-NEXT: paddd %xmm8, %xmm7 +; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm10 +; SSE2-NEXT: paddd %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm10, %xmm9 +; SSE2-NEXT: pmulhuw %xmm8, %xmm9 +; SSE2-NEXT: pmullw %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE2-NEXT: paddd %xmm8, %xmm6 +; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm9 +; SSE2-NEXT: paddd %xmm10, %xmm4 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pmulhuw %xmm8, %xmm10 +; SSE2-NEXT: pmullw %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE2-NEXT: paddd %xmm8, %xmm0 +; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm10 +; SSE2-NEXT: paddd %xmm9, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm9 +; SSE2-NEXT: pmulhuw %xmm8, %xmm9 +; SSE2-NEXT: pmullw %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB11_1 ; SSE2-NEXT: # %bb.2: # %middle.block ; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: paddd %xmm7, %xmm10 -; SSE2-NEXT: paddd %xmm3, %xmm10 -; SSE2-NEXT: paddd %xmm4, %xmm8 -; SSE2-NEXT: paddd %xmm5, %xmm9 -; SSE2-NEXT: paddd %xmm10, %xmm9 -; SSE2-NEXT: paddd %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] -; SSE2-NEXT: paddd %xmm9, %xmm0 +; SSE2-NEXT: paddd %xmm7, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm2 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -1637,10 +1637,10 @@ ; AVX1-LABEL: test_unsigned_short_1024: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: xorl %ecx, %ecx ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB11_1: # %vector.body @@ -1649,57 +1649,57 @@ ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm9 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm11 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmulld %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmulld %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmulld %xmm0, %xmm7, %xmm13 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmulld %xmm12, %xmm7, %xmm7 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmulld %xmm10, %xmm0, %xmm10 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmulld %xmm11, %xmm0, %xmm11 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8 -; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm0 -; AVX1-NEXT: vpaddd %xmm7, %xmm9, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm10, %xmm0 -; AVX1-NEXT: vpaddd %xmm3, %xmm11, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmulld %xmm4, %xmm12, %xmm4 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmulld %xmm5, %xmm12, %xmm5 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmulld %xmm6, %xmm12, %xmm6 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmulld %xmm7, %xmm12, %xmm7 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmulld %xmm8, %xmm12, %xmm8 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmulld %xmm9, %xmm12, %xmm9 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm10 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmulld %xmm11, %xmm12, %xmm11 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm12 +; AVX1-NEXT: vpaddd %xmm4, %xmm12, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, 
%xmm6, %xmm4 +; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm11, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB11_1 ; AVX1-NEXT: # %bb.2: # %middle.block -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -2220,8 +2220,8 @@ ; AVX1-NEXT: vpmaddwd %xmm8, %xmm9, %xmm8 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm8, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 @@ -2653,8 +2653,8 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %arg2, <8 x i16>* %arg3, <8 x i16>* %arg4, <8 x i16>* %arg5, <8 x i16>* %arg6, <8 x i16>* %arg7) { ; SSE2-LABEL: madd_quad_reduction: ; SSE2: # %bb.0: -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: movdqu (%rsi), %xmm1 ; SSE2-NEXT: pmaddwd %xmm0, %xmm1 @@ -2665,8 +2665,8 @@ ; SSE2-NEXT: movdqu (%r9), %xmm3 ; SSE2-NEXT: pmaddwd %xmm0, %xmm3 ; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqu (%rax), %xmm0 -; SSE2-NEXT: movdqu (%r10), %xmm1 +; SSE2-NEXT: movdqu (%r10), %xmm0 +; SSE2-NEXT: movdqu (%rax), %xmm1 ; SSE2-NEXT: pmaddwd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 @@ -2679,8 +2679,8 @@ ; ; AVX-LABEL: madd_quad_reduction: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovdqu (%rdi), %xmm0 ; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%rdx), %xmm1 @@ -2688,8 +2688,8 @@ ; AVX-NEXT: vmovdqu (%r8), %xmm2 ; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%rax), %xmm2 -; AVX-NEXT: vpmaddwd (%r10), %xmm2, %xmm2 +; AVX-NEXT: vmovdqu (%r10), %xmm2 +; AVX-NEXT: vpmaddwd (%rax), %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/masked-iv-unsafe.ll 
b/llvm/test/CodeGen/X86/masked-iv-unsafe.ll --- a/llvm/test/CodeGen/X86/masked-iv-unsafe.ll +++ b/llvm/test/CodeGen/X86/masked-iv-unsafe.ll @@ -341,7 +341,7 @@ define void @another_count_up_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_up_signed: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %r8d, %r8d +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero @@ -350,22 +350,22 @@ ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB6_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: sarq $8, %rax +; CHECK-NEXT: movq %rax, %r8 +; CHECK-NEXT: sarq $8, %r8 ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm0, %xmm3 -; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8) -; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: sarq $24, %rax +; CHECK-NEXT: movsd %xmm3, (%rdi,%r8,8) +; CHECK-NEXT: movq %rcx, %r8 +; CHECK-NEXT: sarq $24, %r8 ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm1, %xmm3 -; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8) +; CHECK-NEXT: movsd %xmm3, (%rdi,%r8,8) ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm2, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rdx) ; CHECK-NEXT: addq $8, %rdx ; CHECK-NEXT: addq $16777216, %rcx # imm = 0x1000000 -; CHECK-NEXT: addq $256, %r8 # imm = 0x100 +; CHECK-NEXT: addq $256, %rax # imm = 0x100 ; CHECK-NEXT: decq %rsi ; CHECK-NEXT: jne .LBB6_1 ; CHECK-NEXT: # %bb.2: # %return diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -1876,12 +1876,12 @@ ; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 ; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5 -; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5 +; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8 +; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpackssdw %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpmovmskb %xmm4, %ecx ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4 diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -2024,12 +2024,12 @@ ; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 ; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5 -; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5 +; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8 +; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpackssdw %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpmovmskb %xmm4, %ecx ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4 diff --git a/llvm/test/CodeGen/X86/masked_gather.ll 
b/llvm/test/CodeGen/X86/masked_gather.ll --- a/llvm/test/CodeGen/X86/masked_gather.ll +++ b/llvm/test/CodeGen/X86/masked_gather.ll @@ -593,13 +593,13 @@ ; SSE-LABEL: gather_v16i8_v16i32_v16i8: ; SSE: # %bb.0: ; SSE-NEXT: movq %rdi, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] ; SSE-NEXT: pmovsxdq %xmm0, %xmm0 -; SSE-NEXT: paddq %xmm8, %xmm0 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: pcmpeqb %xmm4, %xmm6 -; SSE-NEXT: pmovmskb %xmm6, %eax +; SSE-NEXT: paddq %xmm6, %xmm0 +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pcmpeqb %xmm4, %xmm8 +; SSE-NEXT: pmovmskb %xmm8, %eax ; SSE-NEXT: testb $1, %al ; SSE-NEXT: je .LBB3_2 ; SSE-NEXT: # %bb.1: # %cond.load @@ -613,7 +613,7 @@ ; SSE-NEXT: pextrq $1, %xmm0, %rcx ; SSE-NEXT: pinsrb $1, (%rcx), %xmm5 ; SSE-NEXT: .LBB3_4: # %else2 -; SSE-NEXT: paddq %xmm8, %xmm4 +; SSE-NEXT: paddq %xmm6, %xmm4 ; SSE-NEXT: testb $4, %al ; SSE-NEXT: je .LBB3_6 ; SSE-NEXT: # %bb.5: # %cond.load4 @@ -628,7 +628,7 @@ ; SSE-NEXT: pinsrb $3, (%rcx), %xmm5 ; SSE-NEXT: .LBB3_8: # %else8 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: paddq %xmm8, %xmm0 +; SSE-NEXT: paddq %xmm6, %xmm0 ; SSE-NEXT: testb $16, %al ; SSE-NEXT: je .LBB3_10 ; SSE-NEXT: # %bb.9: # %cond.load10 @@ -642,7 +642,7 @@ ; SSE-NEXT: pextrq $1, %xmm0, %rcx ; SSE-NEXT: pinsrb $5, (%rcx), %xmm5 ; SSE-NEXT: .LBB3_12: # %else14 -; SSE-NEXT: paddq %xmm8, %xmm1 +; SSE-NEXT: paddq %xmm6, %xmm1 ; SSE-NEXT: testb $64, %al ; SSE-NEXT: je .LBB3_14 ; SSE-NEXT: # %bb.13: # %cond.load16 @@ -657,7 +657,7 @@ ; SSE-NEXT: pinsrb $7, (%rcx), %xmm5 ; SSE-NEXT: .LBB3_16: # %else20 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: paddq %xmm8, %xmm0 +; SSE-NEXT: paddq %xmm6, %xmm0 ; SSE-NEXT: testl $256, %eax # imm = 0x100 ; SSE-NEXT: je .LBB3_18 ; SSE-NEXT: # %bb.17: # %cond.load22 @@ -671,7 +671,7 @@ ; SSE-NEXT: pextrq $1, %xmm0, %rcx ; SSE-NEXT: pinsrb $9, (%rcx), %xmm5 ; SSE-NEXT: .LBB3_20: # %else26 -; SSE-NEXT: paddq %xmm8, %xmm1 +; SSE-NEXT: paddq %xmm6, %xmm1 ; SSE-NEXT: testl $1024, %eax # imm = 0x400 ; SSE-NEXT: je .LBB3_22 ; SSE-NEXT: # %bb.21: # %cond.load28 @@ -686,7 +686,7 @@ ; SSE-NEXT: pinsrb $11, (%rcx), %xmm5 ; SSE-NEXT: .LBB3_24: # %else32 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSE-NEXT: paddq %xmm8, %xmm0 +; SSE-NEXT: paddq %xmm6, %xmm0 ; SSE-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE-NEXT: je .LBB3_26 ; SSE-NEXT: # %bb.25: # %cond.load34 @@ -700,7 +700,7 @@ ; SSE-NEXT: pextrq $1, %xmm0, %rcx ; SSE-NEXT: pinsrb $13, (%rcx), %xmm5 ; SSE-NEXT: .LBB3_28: # %else38 -; SSE-NEXT: paddq %xmm1, %xmm8 +; SSE-NEXT: paddq %xmm1, %xmm6 ; SSE-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE-NEXT: jne .LBB3_29 ; SSE-NEXT: # %bb.30: # %else41 @@ -710,12 +710,12 @@ ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: retq ; SSE-NEXT: .LBB3_29: # %cond.load40 -; SSE-NEXT: movq %xmm8, %rcx +; SSE-NEXT: movq %xmm6, %rcx ; SSE-NEXT: pinsrb $14, (%rcx), %xmm5 ; SSE-NEXT: testl $32768, %eax # imm = 0x8000 ; SSE-NEXT: je .LBB3_32 ; SSE-NEXT: .LBB3_31: # %cond.load43 -; SSE-NEXT: pextrq $1, %xmm8, %rax +; SSE-NEXT: pextrq $1, %xmm6, %rax ; SSE-NEXT: pinsrb $15, (%rax), %xmm5 ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -576,20 +576,18 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x double> 
%dst) { ; SSE2-LABEL: load_v8f64_v8i64: ; SSE2: ## %bb.0: -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: movaps %xmm6, %xmm9 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,0,3,2] -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: packssdw %xmm6, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm1 +; SSE2-NEXT: packssdw %xmm9, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: packssdw %xmm2, %xmm1 @@ -620,12 +618,12 @@ ; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je LBB6_16 ; SSE2-NEXT: LBB6_15: ## %cond.load19 -; SSE2-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1] +; SSE2-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1] ; SSE2-NEXT: LBB6_16: ## %else20 ; SSE2-NEXT: movaps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm5, %xmm1 -; SSE2-NEXT: movaps %xmm9, %xmm2 -; SSE2-NEXT: movaps %xmm8, %xmm3 +; SSE2-NEXT: movaps %xmm6, %xmm2 +; SSE2-NEXT: movaps %xmm7, %xmm3 ; SSE2-NEXT: retq ; SSE2-NEXT: LBB6_1: ## %cond.load ; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] @@ -644,28 +642,27 @@ ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB6_10 ; SSE2-NEXT: LBB6_9: ## %cond.load10 -; SSE2-NEXT: movlps {{.*#+}} xmm9 = mem[0,1],xmm9[2,3] +; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] ; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je LBB6_12 ; SSE2-NEXT: LBB6_11: ## %cond.load13 -; SSE2-NEXT: movhps {{.*#+}} xmm9 = xmm9[0,1],mem[0,1] +; SSE2-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1] ; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je LBB6_14 ; SSE2-NEXT: LBB6_13: ## %cond.load16 -; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3] +; SSE2-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] ; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: jne LBB6_15 ; SSE2-NEXT: jmp LBB6_16 ; ; SSE42-LABEL: load_v8f64_v8i64: ; SSE42: ## %bb.0: -; SSE42-NEXT: movdqa %xmm7, %xmm8 -; SSE42-NEXT: pxor %xmm7, %xmm7 -; SSE42-NEXT: pcmpeqq %xmm7, %xmm3 -; SSE42-NEXT: pcmpeqq %xmm7, %xmm2 +; SSE42-NEXT: pxor %xmm8, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm8, %xmm3 +; SSE42-NEXT: pcmpeqq %xmm8, %xmm2 ; SSE42-NEXT: packssdw %xmm3, %xmm2 -; SSE42-NEXT: pcmpeqq %xmm7, %xmm1 -; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 +; SSE42-NEXT: pcmpeqq %xmm8, %xmm1 +; SSE42-NEXT: pcmpeqq %xmm8, %xmm0 ; SSE42-NEXT: packssdw %xmm1, %xmm0 ; SSE42-NEXT: packssdw %xmm2, %xmm0 ; SSE42-NEXT: packsswb %xmm0, %xmm0 @@ -694,12 +691,12 @@ ; SSE42-NEXT: testb $-128, %al ; SSE42-NEXT: je LBB6_16 ; SSE42-NEXT: LBB6_15: ## %cond.load19 -; SSE42-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1] +; SSE42-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1] ; SSE42-NEXT: LBB6_16: ## %else20 ; SSE42-NEXT: movaps %xmm4, %xmm0 ; SSE42-NEXT: movaps %xmm5, %xmm1 ; SSE42-NEXT: movaps %xmm6, %xmm2 -; SSE42-NEXT: movaps %xmm8, %xmm3 +; SSE42-NEXT: movaps %xmm7, %xmm3 ; SSE42-NEXT: retq ; SSE42-NEXT: LBB6_1: ## %cond.load ; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] @@ -726,7 +723,7 @@ ; SSE42-NEXT: testb $64, %al ; SSE42-NEXT: je LBB6_14 ; SSE42-NEXT: LBB6_13: ## %cond.load16 -; SSE42-NEXT: movlps {{.*#+}} xmm8 = 
mem[0,1],xmm8[2,3] +; SSE42-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] ; SSE42-NEXT: testb $-128, %al ; SSE42-NEXT: jne LBB6_15 ; SSE42-NEXT: jmp LBB6_16 @@ -1977,20 +1974,18 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x i64> %dst) { ; SSE2-LABEL: load_v8i64_v8i64: ; SSE2: ## %bb.0: -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: movaps %xmm6, %xmm9 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,0,3,2] -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: packssdw %xmm6, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm1 +; SSE2-NEXT: packssdw %xmm9, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: packssdw %xmm2, %xmm1 @@ -2021,13 +2016,13 @@ ; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je LBB16_16 ; SSE2-NEXT: LBB16_15: ## %cond.load19 -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] ; SSE2-NEXT: LBB16_16: ## %else20 ; SSE2-NEXT: movaps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm5, %xmm1 -; SSE2-NEXT: movaps %xmm9, %xmm2 -; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: movaps %xmm6, %xmm2 +; SSE2-NEXT: movaps %xmm7, %xmm3 ; SSE2-NEXT: retq ; SSE2-NEXT: LBB16_1: ## %cond.load ; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] @@ -2048,29 +2043,28 @@ ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB16_10 ; SSE2-NEXT: LBB16_9: ## %cond.load10 -; SSE2-NEXT: movlps {{.*#+}} xmm9 = mem[0,1],xmm9[2,3] +; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] ; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je LBB16_12 ; SSE2-NEXT: LBB16_11: ## %cond.load13 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je LBB16_14 ; SSE2-NEXT: LBB16_13: ## %cond.load16 -; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3] +; SSE2-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] ; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: jne LBB16_15 ; SSE2-NEXT: jmp LBB16_16 ; ; SSE42-LABEL: load_v8i64_v8i64: ; SSE42: ## %bb.0: -; SSE42-NEXT: movdqa %xmm7, %xmm8 -; SSE42-NEXT: pxor %xmm7, %xmm7 -; SSE42-NEXT: pcmpeqq %xmm7, %xmm3 -; SSE42-NEXT: pcmpeqq %xmm7, %xmm2 +; SSE42-NEXT: pxor %xmm8, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm8, %xmm3 +; SSE42-NEXT: pcmpeqq %xmm8, %xmm2 ; SSE42-NEXT: packssdw %xmm3, %xmm2 -; SSE42-NEXT: pcmpeqq %xmm7, %xmm1 -; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 +; SSE42-NEXT: pcmpeqq %xmm8, %xmm1 +; SSE42-NEXT: pcmpeqq %xmm8, %xmm0 ; SSE42-NEXT: packssdw %xmm1, %xmm0 ; SSE42-NEXT: packssdw %xmm2, %xmm0 ; SSE42-NEXT: packsswb %xmm0, %xmm0 @@ -2099,12 +2093,12 @@ ; SSE42-NEXT: testb $-128, %al ; SSE42-NEXT: je LBB16_16 ; SSE42-NEXT: LBB16_15: ## %cond.load19 -; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm8 +; SSE42-NEXT: pinsrq $1, 56(%rdi), %xmm7 ; SSE42-NEXT: LBB16_16: ## %else20 ; SSE42-NEXT: movdqa %xmm4, %xmm0 ; SSE42-NEXT: 
movdqa %xmm5, %xmm1 ; SSE42-NEXT: movdqa %xmm6, %xmm2 -; SSE42-NEXT: movdqa %xmm8, %xmm3 +; SSE42-NEXT: movdqa %xmm7, %xmm3 ; SSE42-NEXT: retq ; SSE42-NEXT: LBB16_1: ## %cond.load ; SSE42-NEXT: pinsrq $0, (%rdi), %xmm4 @@ -2131,7 +2125,7 @@ ; SSE42-NEXT: testb $64, %al ; SSE42-NEXT: je LBB16_14 ; SSE42-NEXT: LBB16_13: ## %cond.load16 -; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm8 +; SSE42-NEXT: pinsrq $0, 48(%rdi), %xmm7 ; SSE42-NEXT: testb $-128, %al ; SSE42-NEXT: jne LBB16_15 ; SSE42-NEXT: jmp LBB16_16 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -11,108 +11,108 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm12, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm11, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm10 -; SSE2-NEXT: por %xmm2, %xmm10 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm12, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm7 -; SSE2-NEXT: por %xmm3, %xmm7 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm12, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm11, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm9, %xmm6 +; SSE2-NEXT: por %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm9, %xmm2 +; SSE2-NEXT: 
por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm9, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067968,18446744071562067968] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm12, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm14 -; SSE2-NEXT: pcmpeqd %xmm14, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: pcmpgtd %xmm13, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm13, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm11 +; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067968,18446744071562067968] +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm11 +; SSE2-NEXT: pandn %xmm0, %xmm12 +; SSE2-NEXT: por %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: 
shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: pxor %xmm12, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm13, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pxor %xmm10, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm13, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2-NEXT: pxor %xmm14, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm14, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm12[0,2] +; SSE2-NEXT: movdqa %xmm2, %xmm11 +; SSE2-NEXT: pxor %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2] +; SSE2-NEXT: pand %xmm12, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: pxor %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm12, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax @@ -122,32 +122,32 @@ ; SSE2-NEXT: movss %xmm1, (%rdi) ; SSE2-NEXT: .LBB0_2: # %else ; SSE2-NEXT: por %xmm11, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm6 +; SSE2-NEXT: por %xmm8, %xmm10 ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB0_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE2-NEXT: movd %xmm0, 4(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE2-NEXT: movd %xmm4, 4(%rdi) ; SSE2-NEXT: .LBB0_4: # %else2 -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pandn %xmm9, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pandn %xmm0, %xmm10 ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: .LBB0_6: # %else4 -; SSE2-NEXT: por %xmm3, %xmm7 -; SSE2-NEXT: por %xmm6, %xmm10 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm10, %xmm6 ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB0_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; SSE2-NEXT: movd %xmm0, 12(%rdi) ; SSE2-NEXT: .LBB0_8: # %else6 -; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm7[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm2[0,2] ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: jne .LBB0_9 ; SSE2-NEXT: # %bb.10: # %else8 @@ -162,65 +162,65 @@ ; SSE2-NEXT: .LBB0_16: # %else14 ; SSE2-NEXT: retq ; SSE2-NEXT: .LBB0_9: # %cond.store7 -; SSE2-NEXT: movss %xmm10, 16(%rdi) +; SSE2-NEXT: movss %xmm6, 16(%rdi) ; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB0_12 ; SSE2-NEXT: .LBB0_11: # %cond.store9 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm6[1,1,1,1] ; SSE2-NEXT: movd %xmm0, 20(%rdi) ; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB0_14 ; SSE2-NEXT: .LBB0_13: # %cond.store11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: movd %xmm0, 24(%rdi) ; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB0_16 ; SSE2-NEXT: .LBB0_15: # %cond.store13 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; SSE2-NEXT: movd %xmm0, 28(%rdi) ; SSE2-NEXT: retq ; ; SSE4-LABEL: truncstore_v8i64_v8i32: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647] -; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm6 +; SSE4-NEXT: pxor %xmm7, %xmm7 +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm7, %xmm10 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm10 -; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm8 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movdqa %xmm7, %xmm6 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 -; SSE4-NEXT: movdqa %xmm7, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3 -; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm10 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm10 +; SSE4-NEXT: movdqa %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm3 +; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9 ; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; SSE4-NEXT: movapd %xmm7, %xmm0 +; SSE4-NEXT: movapd %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm1, %xmm2 -; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm2 +; SSE4-NEXT: movdqa %xmm1, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm6 ; SSE4-NEXT: movapd %xmm3, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm1, %xmm7 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE4-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2] -; SSE4-NEXT: movapd %xmm6, %xmm0 +; SSE4-NEXT: movdqa %xmm1, %xmm2 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE4-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2] +; SSE4-NEXT: movapd %xmm10, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3 -; SSE4-NEXT: movapd %xmm10, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm3 +; SSE4-NEXT: movapd %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE4-NEXT: pcmpeqd %xmm7, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE4-NEXT: pxor %xmm0, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pcmpeqd %xmm7, %xmm4 ; SSE4-NEXT: pxor %xmm0, %xmm4 ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 @@ -237,7 +237,7 @@ ; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB0_8 ; SSE4-NEXT: .LBB0_7: # %cond.store5 -; SSE4-NEXT: extractps $3, %xmm7, 12(%rdi) +; SSE4-NEXT: extractps $3, %xmm2, 12(%rdi) ; SSE4-NEXT: .LBB0_8: # %else6 ; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] ; SSE4-NEXT: testb $16, %al @@ -254,15 +254,15 
@@ ; SSE4-NEXT: .LBB0_16: # %else14 ; SSE4-NEXT: retq ; SSE4-NEXT: .LBB0_1: # %cond.store -; SSE4-NEXT: movss %xmm7, (%rdi) +; SSE4-NEXT: movss %xmm2, (%rdi) ; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB0_4 ; SSE4-NEXT: .LBB0_3: # %cond.store1 -; SSE4-NEXT: extractps $1, %xmm7, 4(%rdi) +; SSE4-NEXT: extractps $1, %xmm2, 4(%rdi) ; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB0_6 ; SSE4-NEXT: .LBB0_5: # %cond.store3 -; SSE4-NEXT: extractps $2, %xmm7, 8(%rdi) +; SSE4-NEXT: extractps $2, %xmm2, 8(%rdi) ; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: jne .LBB0_7 ; SSE4-NEXT: jmp .LBB0_8 @@ -383,118 +383,118 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [32767,32767] -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm11, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm10 -; SSE2-NEXT: por %xmm2, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pxor %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm8 +; SSE2-NEXT: por %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm7 -; SSE2-NEXT: por %xmm3, %xmm7 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = 
xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm13 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744073709518848,18446744073709518848] -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm13, %xmm13 -; SSE2-NEXT: pcmpeqd %xmm13, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm12, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm13, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm13, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE2-NEXT: por %xmm13, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm7 -; SSE2-NEXT: pandn %xmm9, %xmm1 -; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm13, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm10 ; SSE2-NEXT: pandn %xmm9, %xmm2 -; SSE2-NEXT: por %xmm10, %xmm2 -; SSE2-NEXT: packssdw %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm10 +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm11 +; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, 
%xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm11 +; SSE2-NEXT: pandn %xmm3, %xmm12 +; SSE2-NEXT: por %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm10 +; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 +; SSE2-NEXT: packssdw %xmm12, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: pxor %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm12, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm10 +; SSE2-NEXT: por %xmm2, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm2, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm8 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm8, %xmm2 +; SSE2-NEXT: packssdw %xmm10, %xmm2 ; SSE2-NEXT: packssdw %xmm2, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2-NEXT: pxor %xmm13, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm13, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax @@ -565,47 +565,47 @@ ; ; SSE4-LABEL: truncstore_v8i64_v8i16: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [32767,32767] -; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm6 +; SSE4-NEXT: pxor %xmm7, %xmm7 +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [32767,32767] +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm7, %xmm10 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm10 -; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm8 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movdqa %xmm7, %xmm2 +; SSE4-NEXT: movdqa %xmm9, %xmm2 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 -; SSE4-NEXT: movdqa %xmm7, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3 -; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] -; SSE4-NEXT: movapd %xmm7, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm3 +; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; 
SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm1, %xmm6 -; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709518848,18446744073709518848] +; SSE4-NEXT: movapd %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: movdqa %xmm6, %xmm10 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm10 ; SSE4-NEXT: movapd %xmm3, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm1, %xmm7 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE4-NEXT: packssdw %xmm6, %xmm7 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: movdqa %xmm6, %xmm1 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE4-NEXT: packssdw %xmm10, %xmm1 ; SSE4-NEXT: movapd %xmm2, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm1, %xmm3 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: movdqa %xmm6, %xmm3 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE4-NEXT: movapd %xmm10, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm1 -; SSE4-NEXT: packssdw %xmm3, %xmm1 -; SSE4-NEXT: packssdw %xmm1, %xmm7 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: movapd %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm6 +; SSE4-NEXT: packssdw %xmm3, %xmm6 +; SSE4-NEXT: packssdw %xmm6, %xmm1 +; SSE4-NEXT: pcmpeqd %xmm7, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE4-NEXT: pxor %xmm0, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pcmpeqd %xmm7, %xmm4 ; SSE4-NEXT: pxor %xmm0, %xmm4 ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 @@ -636,35 +636,35 @@ ; SSE4-NEXT: .LBB1_16: # %else14 ; SSE4-NEXT: retq ; SSE4-NEXT: .LBB1_1: # %cond.store -; SSE4-NEXT: pextrw $0, %xmm7, (%rdi) +; SSE4-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB1_4 ; SSE4-NEXT: .LBB1_3: # %cond.store1 -; SSE4-NEXT: pextrw $1, %xmm7, 2(%rdi) +; SSE4-NEXT: pextrw $1, %xmm1, 2(%rdi) ; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB1_6 ; SSE4-NEXT: .LBB1_5: # %cond.store3 -; SSE4-NEXT: pextrw $2, %xmm7, 4(%rdi) +; SSE4-NEXT: pextrw $2, %xmm1, 4(%rdi) ; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB1_8 ; SSE4-NEXT: .LBB1_7: # %cond.store5 -; SSE4-NEXT: pextrw $3, %xmm7, 6(%rdi) +; SSE4-NEXT: pextrw $3, %xmm1, 6(%rdi) ; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB1_10 ; SSE4-NEXT: .LBB1_9: # %cond.store7 -; SSE4-NEXT: pextrw $4, %xmm7, 8(%rdi) +; SSE4-NEXT: pextrw $4, %xmm1, 8(%rdi) ; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB1_12 ; SSE4-NEXT: .LBB1_11: # %cond.store9 -; SSE4-NEXT: pextrw $5, %xmm7, 10(%rdi) +; SSE4-NEXT: pextrw $5, %xmm1, 10(%rdi) ; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB1_14 ; SSE4-NEXT: .LBB1_13: # %cond.store11 -; SSE4-NEXT: pextrw $6, %xmm7, 12(%rdi) +; SSE4-NEXT: pextrw $6, %xmm1, 12(%rdi) ; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB1_16 ; SSE4-NEXT: .LBB1_15: # %cond.store13 -; SSE4-NEXT: pextrw $7, %xmm7, 14(%rdi) +; SSE4-NEXT: pextrw $7, %xmm1, 14(%rdi) ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i64_v8i16: @@ -933,119 +933,119 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [127,127] -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm11, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} 
xmm12 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm10 -; SSE2-NEXT: por %xmm2, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pxor %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm9, %xmm8 +; SSE2-NEXT: por %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm7 -; SSE2-NEXT: por %xmm3, %xmm7 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm13 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm13, %xmm13 -; SSE2-NEXT: pcmpeqd %xmm13, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm12, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm13, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} 
xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm13, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm7 -; SSE2-NEXT: pandn %xmm9, %xmm1 -; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm13, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2] +; SSE2-NEXT: por %xmm13, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm10 ; SSE2-NEXT: pandn %xmm9, %xmm2 -; SSE2-NEXT: por %xmm10, %xmm2 -; SSE2-NEXT: packssdw %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm10 +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm11 +; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm11 +; SSE2-NEXT: pandn %xmm3, %xmm12 +; SSE2-NEXT: por %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm10 +; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 +; SSE2-NEXT: packssdw %xmm12, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: pxor %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: 
pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm12, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm10 +; SSE2-NEXT: por %xmm2, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm2, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm8 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm8, %xmm2 +; SSE2-NEXT: packssdw %xmm10, %xmm2 ; SSE2-NEXT: packssdw %xmm2, %xmm0 ; SSE2-NEXT: packsswb %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2-NEXT: pxor %xmm13, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm13, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax @@ -1109,48 +1109,48 @@ ; ; SSE4-LABEL: truncstore_v8i64_v8i8: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa %xmm0, %xmm9 -; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [127,127] -; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm6 +; SSE4-NEXT: pxor %xmm7, %xmm7 +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [127,127] +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm7, %xmm10 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm10 -; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm8 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movdqa %xmm7, %xmm2 +; SSE4-NEXT: movdqa %xmm9, %xmm2 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 -; SSE4-NEXT: movdqa %xmm7, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3 -; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; SSE4-NEXT: movapd %xmm7, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm3 +; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm1, %xmm6 -; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; SSE4-NEXT: movapd %xmm9, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: movdqa %xmm6, %xmm10 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm10 ; SSE4-NEXT: movapd %xmm3, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm1, %xmm7 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE4-NEXT: packssdw %xmm6, %xmm7 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: movdqa %xmm6, %xmm1 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE4-NEXT: packssdw %xmm10, %xmm1 ; SSE4-NEXT: movapd %xmm2, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm1, %xmm3 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: movdqa %xmm6, %xmm3 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE4-NEXT: movapd %xmm10, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm1 -; SSE4-NEXT: packssdw %xmm3, 
%xmm1 -; SSE4-NEXT: packssdw %xmm1, %xmm7 -; SSE4-NEXT: packsswb %xmm7, %xmm7 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: movapd %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm6 +; SSE4-NEXT: packssdw %xmm3, %xmm6 +; SSE4-NEXT: packssdw %xmm6, %xmm1 +; SSE4-NEXT: packsswb %xmm1, %xmm1 +; SSE4-NEXT: pcmpeqd %xmm7, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE4-NEXT: pxor %xmm0, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: pcmpeqd %xmm7, %xmm4 ; SSE4-NEXT: pxor %xmm0, %xmm4 ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 @@ -1181,35 +1181,35 @@ ; SSE4-NEXT: .LBB2_16: # %else14 ; SSE4-NEXT: retq ; SSE4-NEXT: .LBB2_1: # %cond.store -; SSE4-NEXT: pextrb $0, %xmm7, (%rdi) +; SSE4-NEXT: pextrb $0, %xmm1, (%rdi) ; SSE4-NEXT: testb $2, %al ; SSE4-NEXT: je .LBB2_4 ; SSE4-NEXT: .LBB2_3: # %cond.store1 -; SSE4-NEXT: pextrb $1, %xmm7, 1(%rdi) +; SSE4-NEXT: pextrb $1, %xmm1, 1(%rdi) ; SSE4-NEXT: testb $4, %al ; SSE4-NEXT: je .LBB2_6 ; SSE4-NEXT: .LBB2_5: # %cond.store3 -; SSE4-NEXT: pextrb $2, %xmm7, 2(%rdi) +; SSE4-NEXT: pextrb $2, %xmm1, 2(%rdi) ; SSE4-NEXT: testb $8, %al ; SSE4-NEXT: je .LBB2_8 ; SSE4-NEXT: .LBB2_7: # %cond.store5 -; SSE4-NEXT: pextrb $3, %xmm7, 3(%rdi) +; SSE4-NEXT: pextrb $3, %xmm1, 3(%rdi) ; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: je .LBB2_10 ; SSE4-NEXT: .LBB2_9: # %cond.store7 -; SSE4-NEXT: pextrb $4, %xmm7, 4(%rdi) +; SSE4-NEXT: pextrb $4, %xmm1, 4(%rdi) ; SSE4-NEXT: testb $32, %al ; SSE4-NEXT: je .LBB2_12 ; SSE4-NEXT: .LBB2_11: # %cond.store9 -; SSE4-NEXT: pextrb $5, %xmm7, 5(%rdi) +; SSE4-NEXT: pextrb $5, %xmm1, 5(%rdi) ; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB2_14 ; SSE4-NEXT: .LBB2_13: # %cond.store11 -; SSE4-NEXT: pextrb $6, %xmm7, 6(%rdi) +; SSE4-NEXT: pextrb $6, %xmm1, 6(%rdi) ; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB2_16 ; SSE4-NEXT: .LBB2_15: # %cond.store13 -; SSE4-NEXT: pextrb $7, %xmm7, 7(%rdi) +; SSE4-NEXT: pextrb $7, %xmm1, 7(%rdi) ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i64_v8i8: @@ -1480,63 +1480,63 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i64_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm5 +; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = 
xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067968,18446744071562067968] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pandn %xmm9, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al @@ -1709,64 +1709,64 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i64_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [32767,32767] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: 
movdqa %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm5 +; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744073709518848,18446744073709518848] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm9, 
%xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pandn %xmm9, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: packssdw %xmm4, %xmm0 +; SSE2-NEXT: packssdw %xmm6, %xmm0 ; SSE2-NEXT: packssdw %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al @@ -2023,68 +2023,68 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i64_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [127,127] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm5 +; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: 
movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm9, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %ecx ; SSE2-NEXT: xorl $15, %ecx ; SSE2-NEXT: testb $1, %cl diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -11,91 +11,91 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm11, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [9223372039002259455,9223372039002259455] -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm6 -; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm10 
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm11 +; SSE2-NEXT: por %xmm1, %xmm11 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm10 -; SSE2-NEXT: movdqa %xmm12, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm13 -; SSE2-NEXT: pand %xmm13, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm13 -; SSE2-NEXT: por %xmm0, %xmm13 -; SSE2-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm6[0,2] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm12 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: movdqa %xmm2, %xmm11 +; SSE2-NEXT: pxor %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm12, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm11, %xmm11 +; SSE2-NEXT: pxor %xmm11, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm11, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB0_2 ; SSE2-NEXT: # %bb.1: # %cond.store -; SSE2-NEXT: movss %xmm13, (%rdi) +; SSE2-NEXT: movss %xmm1, (%rdi) ; SSE2-NEXT: .LBB0_2: # %else ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: por %xmm9, %xmm8 ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB0_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE2-NEXT: movd %xmm1, 4(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE2-NEXT: movd %xmm4, 4(%rdi) ; SSE2-NEXT: .LBB0_4: # %else2 ; 
SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm0 -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm7 +; SSE2-NEXT: pandn %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm8 ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE2-NEXT: movd %xmm1, 8(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; SSE2-NEXT: movd %xmm4, 8(%rdi) ; SSE2-NEXT: .LBB0_6: # %else4 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: por %xmm8, %xmm2 ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB0_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; SSE2-NEXT: movd %xmm0, 12(%rdi) ; SSE2-NEXT: .LBB0_8: # %else6 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] @@ -133,34 +133,34 @@ ; ; SSE4-LABEL: truncstore_v8i64_v8i32: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa %xmm0, %xmm8 +; SSE4-NEXT: movdqa %xmm0, %xmm6 ; SSE4-NEXT: pxor %xmm9, %xmm9 -; SSE4-NEXT: movapd {{.*#+}} xmm10 = [4294967295,4294967295] -; SSE4-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm1, %xmm6 -; SSE4-NEXT: pxor %xmm11, %xmm6 +; SSE4-NEXT: movapd {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE4-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm1, %xmm11 +; SSE4-NEXT: pxor %xmm10, %xmm11 ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372041149743103,9223372041149743103] ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: movapd %xmm10, %xmm6 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE4-NEXT: movdqa %xmm8, %xmm1 -; SSE4-NEXT: pxor %xmm11, %xmm1 +; SSE4-NEXT: pcmpgtq %xmm11, %xmm0 +; SSE4-NEXT: movapd %xmm8, %xmm11 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm11 +; SSE4-NEXT: movdqa %xmm6, %xmm1 +; SSE4-NEXT: pxor %xmm10, %xmm1 ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movapd %xmm10, %xmm1 -; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2] +; SSE4-NEXT: movapd %xmm8, %xmm1 +; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] ; SSE4-NEXT: movdqa %xmm3, %xmm6 -; SSE4-NEXT: pxor %xmm11, %xmm6 +; SSE4-NEXT: pxor %xmm10, %xmm6 ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: movapd %xmm10, %xmm8 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 -; SSE4-NEXT: pxor %xmm2, %xmm11 -; SSE4-NEXT: pcmpgtq %xmm11, %xmm7 +; SSE4-NEXT: movapd %xmm8, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE4-NEXT: pxor %xmm2, %xmm10 +; SSE4-NEXT: pcmpgtq %xmm10, %xmm7 ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm10 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8 ; SSE4-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE4-NEXT: pxor %xmm0, %xmm5 @@ -183,7 +183,7 @@ ; SSE4-NEXT: .LBB0_7: # %cond.store5 ; SSE4-NEXT: extractps $3, %xmm1, 12(%rdi) ; SSE4-NEXT: .LBB0_8: # %else6 -; SSE4-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2] +; SSE4-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[0,2] ; SSE4-NEXT: testb $16, %al ; SSE4-NEXT: jne .LBB0_9 ; SSE4-NEXT: # %bb.10: # %else8 @@ -211,19 +211,19 @@ ; SSE4-NEXT: jne .LBB0_7 ; SSE4-NEXT: jmp .LBB0_8 ; SSE4-NEXT: .LBB0_9: # %cond.store7 -; SSE4-NEXT: movss %xmm10, 16(%rdi) +; SSE4-NEXT: movss %xmm8, 16(%rdi) ; SSE4-NEXT: testb $32, %al ; 
SSE4-NEXT: je .LBB0_12 ; SSE4-NEXT: .LBB0_11: # %cond.store9 -; SSE4-NEXT: extractps $1, %xmm10, 20(%rdi) +; SSE4-NEXT: extractps $1, %xmm8, 20(%rdi) ; SSE4-NEXT: testb $64, %al ; SSE4-NEXT: je .LBB0_14 ; SSE4-NEXT: .LBB0_13: # %cond.store11 -; SSE4-NEXT: extractps $2, %xmm10, 24(%rdi) +; SSE4-NEXT: extractps $2, %xmm8, 24(%rdi) ; SSE4-NEXT: testb $-128, %al ; SSE4-NEXT: je .LBB0_16 ; SSE4-NEXT: .LBB0_15: # %cond.store13 -; SSE4-NEXT: extractps $3, %xmm10, 28(%rdi) +; SSE4-NEXT: extractps $3, %xmm8, 28(%rdi) ; SSE4-NEXT: retq ; ; AVX1-LABEL: truncstore_v8i64_v8i32: @@ -235,7 +235,7 @@ ; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm8 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103] @@ -246,17 +246,17 @@ ; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm7 ; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm6, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm8 +; AVX1-NEXT: vblendvpd %xmm8, %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX1-NEXT: vmaskmovps %ymm0, %ymm8, (%rdi) +; AVX1-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -318,76 +318,76 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm12 -; SSE2-NEXT: por %xmm2, %xmm12 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pxor %xmm9, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] 
+; SSE2-NEXT: por %xmm13, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm11 +; SSE2-NEXT: por %xmm0, %xmm11 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm7, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,2,2,3] ; 
SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 @@ -459,36 +459,36 @@ ; ; SSE4-LABEL: truncstore_v8i64_v8i16: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa %xmm0, %xmm9 +; SSE4-NEXT: movdqa %xmm0, %xmm6 ; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: movapd {{.*#+}} xmm6 = [65535,65535] +; SSE4-NEXT: movapd {{.*#+}} xmm9 = [65535,65535] ; SSE4-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm1, %xmm7 -; SSE4-NEXT: pxor %xmm10, %xmm7 -; SSE4-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854841343,9223372036854841343] -; SSE4-NEXT: movdqa %xmm11, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 -; SSE4-NEXT: movapd %xmm6, %xmm7 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE4-NEXT: movdqa %xmm9, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm11 +; SSE4-NEXT: pxor %xmm10, %xmm11 +; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854841343,9223372036854841343] +; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm11, %xmm0 +; SSE4-NEXT: movapd %xmm9, %xmm11 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm11 +; SSE4-NEXT: movdqa %xmm6, %xmm1 ; SSE4-NEXT: pxor %xmm10, %xmm1 -; SSE4-NEXT: movdqa %xmm11, %xmm0 +; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movapd %xmm6, %xmm1 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm1 -; SSE4-NEXT: packusdw %xmm7, %xmm1 -; SSE4-NEXT: movdqa %xmm3, %xmm7 -; SSE4-NEXT: pxor %xmm10, %xmm7 -; SSE4-NEXT: movdqa %xmm11, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 -; SSE4-NEXT: movapd %xmm6, %xmm7 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE4-NEXT: movapd %xmm9, %xmm1 +; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE4-NEXT: packusdw %xmm11, %xmm1 +; SSE4-NEXT: movdqa %xmm3, %xmm6 +; SSE4-NEXT: pxor %xmm10, %xmm6 +; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: movapd %xmm9, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6 ; SSE4-NEXT: pxor %xmm2, %xmm10 -; SSE4-NEXT: pcmpgtq %xmm10, %xmm11 -; SSE4-NEXT: movdqa %xmm11, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 -; SSE4-NEXT: packusdw %xmm7, %xmm6 -; SSE4-NEXT: packusdw %xmm6, %xmm1 +; SSE4-NEXT: pcmpgtq %xmm10, %xmm7 +; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9 +; SSE4-NEXT: packusdw %xmm6, %xmm9 +; SSE4-NEXT: packusdw %xmm9, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE4-NEXT: pxor %xmm0, %xmm5 @@ -812,75 +812,75 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; 
SSE2-NEXT: por %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm7 -; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm1 +; SSE2-NEXT: pandn %xmm7, %xmm11 +; SSE2-NEXT: por %xmm1, %xmm11 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm12 -; SSE2-NEXT: movdqa %xmm11, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm6 -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] +; SSE2-NEXT: pandn %xmm7, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: packuswb %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm10 +; SSE2-NEXT: por %xmm3, %xmm10 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm3 ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: packuswb %xmm1, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm6 -; 
SSE2-NEXT: packuswb %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: packuswb %xmm10, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm0, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm6, %ecx +; SSE2-NEXT: movd %xmm1, %ecx ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -896,7 +896,7 @@ ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB2_8: # %else6 ; SSE2-NEXT: testb $16, %al -; SSE2-NEXT: pextrw $2, %xmm6, %ecx +; SSE2-NEXT: pextrw $2, %xmm1, %ecx ; SSE2-NEXT: je .LBB2_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -907,7 +907,7 @@ ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB2_12: # %else10 ; SSE2-NEXT: testb $64, %al -; SSE2-NEXT: pextrw $3, %xmm6, %ecx +; SSE2-NEXT: pextrw $3, %xmm1, %ecx ; SSE2-NEXT: jne .LBB2_13 ; SSE2-NEXT: # %bb.14: # %else12 ; SSE2-NEXT: testb $-128, %al @@ -939,36 +939,36 @@ ; ; SSE4-LABEL: truncstore_v8i64_v8i8: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa %xmm0, %xmm9 +; SSE4-NEXT: movdqa %xmm0, %xmm6 ; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: movapd {{.*#+}} xmm6 = [255,255] +; SSE4-NEXT: movapd {{.*#+}} xmm9 = [255,255] ; SSE4-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm1, %xmm7 -; SSE4-NEXT: pxor %xmm10, %xmm7 -; SSE4-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854776063,9223372036854776063] -; SSE4-NEXT: movdqa %xmm11, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 -; SSE4-NEXT: movapd %xmm6, %xmm7 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE4-NEXT: movdqa %xmm9, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm11 +; SSE4-NEXT: pxor %xmm10, %xmm11 +; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854776063,9223372036854776063] +; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm11, %xmm0 +; SSE4-NEXT: movapd %xmm9, %xmm11 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm11 +; SSE4-NEXT: movdqa %xmm6, %xmm1 ; SSE4-NEXT: pxor %xmm10, %xmm1 -; SSE4-NEXT: movdqa %xmm11, %xmm0 +; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movapd %xmm6, %xmm1 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm1 -; SSE4-NEXT: packusdw %xmm7, %xmm1 -; SSE4-NEXT: movdqa %xmm3, %xmm7 -; SSE4-NEXT: pxor %xmm10, %xmm7 -; SSE4-NEXT: movdqa %xmm11, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 -; SSE4-NEXT: movapd %xmm6, %xmm7 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE4-NEXT: movapd %xmm9, %xmm1 +; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE4-NEXT: packusdw %xmm11, %xmm1 +; SSE4-NEXT: movdqa %xmm3, %xmm6 +; SSE4-NEXT: pxor %xmm10, %xmm6 +; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 +; SSE4-NEXT: movapd %xmm9, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6 ; SSE4-NEXT: pxor %xmm2, %xmm10 -; SSE4-NEXT: pcmpgtq %xmm10, %xmm11 -; SSE4-NEXT: movdqa %xmm11, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 -; SSE4-NEXT: packusdw %xmm7, %xmm6 -; SSE4-NEXT: packusdw %xmm6, %xmm1 +; SSE4-NEXT: pcmpgtq %xmm10, %xmm7 +; SSE4-NEXT: movdqa %xmm7, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9 +; SSE4-NEXT: packusdw %xmm6, %xmm9 +; SSE4-NEXT: packusdw %xmm9, %xmm1 ; SSE4-NEXT: packuswb %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 @@ -1296,35 +1296,35 @@ ; SSE2-LABEL: 
truncstore_v4i64_v4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm9, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,2,2] ; SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm1, %xmm7 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %eax ; SSE2-NEXT: xorl $15, %eax @@ -1362,22 +1362,22 @@ ; ; SSE4-LABEL: truncstore_v4i64_v4i32: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa %xmm0, %xmm8 +; SSE4-NEXT: movdqa %xmm0, %xmm3 ; SSE4-NEXT: pxor %xmm6, %xmm6 ; SSE4-NEXT: movapd {{.*#+}} xmm5 = [4294967295,4294967295] ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm1, %xmm3 -; SSE4-NEXT: pxor %xmm7, %xmm3 +; SSE4-NEXT: movdqa %xmm1, %xmm8 +; SSE4-NEXT: pxor %xmm7, %xmm8 ; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movapd %xmm5, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE4-NEXT: pxor %xmm8, %xmm7 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 +; SSE4-NEXT: movapd %xmm5, %xmm8 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE4-NEXT: pxor %xmm3, %xmm7 ; SSE4-NEXT: pcmpgtq %xmm7, %xmm4 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE4-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2] +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE4-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm8[0,2] ; SSE4-NEXT: pcmpeqd %xmm2, %xmm6 ; SSE4-NEXT: movmskps %xmm6, %eax ; SSE4-NEXT: xorl $15, %eax @@ -1492,37 +1492,37 @@ ; 
SSE2-LABEL: truncstore_v4i64_v4i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm9, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,2,2] ; SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm5 ; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 @@ -1563,22 +1563,22 @@ ; ; SSE4-LABEL: truncstore_v4i64_v4i16: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa %xmm0, %xmm8 +; SSE4-NEXT: movdqa %xmm0, %xmm3 ; SSE4-NEXT: pxor %xmm6, %xmm6 ; SSE4-NEXT: movapd {{.*#+}} xmm5 = [65535,65535] ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm1, %xmm3 -; SSE4-NEXT: pxor %xmm7, %xmm3 +; SSE4-NEXT: movdqa %xmm1, %xmm8 +; SSE4-NEXT: pxor %xmm7, %xmm8 ; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movapd %xmm5, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE4-NEXT: pxor %xmm8, %xmm7 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 +; SSE4-NEXT: movapd %xmm5, %xmm8 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE4-NEXT: pxor %xmm3, %xmm7 ; SSE4-NEXT: pcmpgtq %xmm7, %xmm4 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE4-NEXT: packusdw %xmm3, %xmm5 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE4-NEXT: packusdw %xmm8, %xmm5 ; SSE4-NEXT: packusdw %xmm5, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm6 ; SSE4-NEXT: movmskps 
%xmm6, %eax @@ -1775,42 +1775,42 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i64_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm5, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packuswb %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: packuswb %xmm6, %xmm4 ; SSE2-NEXT: packuswb %xmm4, %xmm4 ; SSE2-NEXT: packuswb %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm10 -; SSE2-NEXT: movmskps %xmm10, %ecx +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %ecx ; SSE2-NEXT: xorl $15, %ecx ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm4, %eax @@ -1848,26 +1848,26 @@ ; SSE4-LABEL: truncstore_v4i64_v4i8: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm3 -; SSE4-NEXT: pxor %xmm8, %xmm8 +; SSE4-NEXT: pxor %xmm6, %xmm6 ; SSE4-NEXT: movapd {{.*#+}} xmm7 = [255,255] -; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: movdqa %xmm0, %xmm5 -; SSE4-NEXT: pxor %xmm6, %xmm5 +; SSE4-NEXT: pxor %xmm8, %xmm5 ; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063] ; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm5, %xmm0 ; SSE4-NEXT: movapd %xmm7, %xmm5 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE4-NEXT: pxor %xmm1, %xmm6 -; 
SSE4-NEXT: pcmpgtq %xmm6, %xmm4 +; SSE4-NEXT: pxor %xmm1, %xmm8 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm4 ; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 ; SSE4-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; SSE4-NEXT: pshufb %xmm0, %xmm7 ; SSE4-NEXT: pshufb %xmm0, %xmm5 ; SSE4-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE4-NEXT: pcmpeqd %xmm2, %xmm8 -; SSE4-NEXT: movmskps %xmm8, %eax +; SSE4-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE4-NEXT: movmskps %xmm6, %eax ; SSE4-NEXT: xorl $15, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB5_1 @@ -2511,18 +2511,18 @@ ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pxor %xmm9, %xmm8 ; SSE2-NEXT: por %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: pxor %xmm11, %xmm13 -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm13, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm10, %xmm13 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE2-NEXT: pand %xmm13, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm13 +; SSE2-NEXT: por %xmm1, %xmm13 +; SSE2-NEXT: pslld $16, %xmm13 +; SSE2-NEXT: psrad $16, %xmm13 ; SSE2-NEXT: pslld $16, %xmm8 ; SSE2-NEXT: psrad $16, %xmm8 -; SSE2-NEXT: packssdw %xmm0, %xmm8 +; SSE2-NEXT: packssdw %xmm13, %xmm8 ; SSE2-NEXT: pcmpeqd %xmm12, %xmm7 ; SSE2-NEXT: pxor %xmm9, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm12, %xmm6 @@ -3231,28 +3231,28 @@ ; SSE2-NEXT: pand %xmm13, %xmm1 ; SSE2-NEXT: pandn %xmm10, %xmm13 ; SSE2-NEXT: por %xmm1, %xmm13 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm0 -; SSE2-NEXT: pandn %xmm10, %xmm12 -; SSE2-NEXT: por %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm13, %xmm12 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm12 +; SSE2-NEXT: pxor %xmm11, %xmm12 ; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm12, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm10, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm13, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm3 +; SSE2-NEXT: pandn %xmm10, %xmm12 +; SSE2-NEXT: por %xmm3, %xmm12 ; SSE2-NEXT: pxor %xmm2, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm11, %xmm9 ; SSE2-NEXT: pand %xmm9, %xmm2 ; SSE2-NEXT: pandn %xmm10, %xmm9 ; SSE2-NEXT: por %xmm2, %xmm9 -; SSE2-NEXT: packuswb %xmm1, %xmm9 -; SSE2-NEXT: packuswb %xmm9, %xmm12 +; SSE2-NEXT: packuswb %xmm12, %xmm9 +; SSE2-NEXT: packuswb %xmm9, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm7 @@ -3267,7 +3267,7 @@ ; SSE2-NEXT: packsswb %xmm6, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm12, %ecx +; SSE2-NEXT: movd %xmm1, %ecx ; SSE2-NEXT: jne .LBB10_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -3283,7 +3283,7 @@ ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB10_8: # %else6 ; SSE2-NEXT: testb $16, %al -; SSE2-NEXT: pextrw $2, %xmm12, %ecx +; SSE2-NEXT: pextrw $2, %xmm1, %ecx ; SSE2-NEXT: je .LBB10_10 ; SSE2-NEXT: # 
%bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -3294,7 +3294,7 @@ ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB10_12: # %else10 ; SSE2-NEXT: testb $64, %al -; SSE2-NEXT: pextrw $3, %xmm12, %ecx +; SSE2-NEXT: pextrw $3, %xmm1, %ecx ; SSE2-NEXT: je .LBB10_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) @@ -3305,7 +3305,7 @@ ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB10_16: # %else14 ; SSE2-NEXT: testl $256, %eax # imm = 0x100 -; SSE2-NEXT: pextrw $4, %xmm12, %ecx +; SSE2-NEXT: pextrw $4, %xmm1, %ecx ; SSE2-NEXT: je .LBB10_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -3316,7 +3316,7 @@ ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB10_20: # %else18 ; SSE2-NEXT: testl $1024, %eax # imm = 0x400 -; SSE2-NEXT: pextrw $5, %xmm12, %ecx +; SSE2-NEXT: pextrw $5, %xmm1, %ecx ; SSE2-NEXT: je .LBB10_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) @@ -3327,7 +3327,7 @@ ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB10_24: # %else22 ; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 -; SSE2-NEXT: pextrw $6, %xmm12, %ecx +; SSE2-NEXT: pextrw $6, %xmm1, %ecx ; SSE2-NEXT: je .LBB10_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) @@ -3338,7 +3338,7 @@ ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB10_28: # %else26 ; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 -; SSE2-NEXT: pextrw $7, %xmm12, %ecx +; SSE2-NEXT: pextrw $7, %xmm1, %ecx ; SSE2-NEXT: jne .LBB10_29 ; SSE2-NEXT: # %bb.30: # %else28 ; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 @@ -3929,30 +3929,30 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i32_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: pxor %xmm7, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm6, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm8 +; SSE2-NEXT: por %xmm1, %xmm8 +; SSE2-NEXT: pslld $16, %xmm8 +; SSE2-NEXT: psrad $16, %xmm8 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: packssdw %xmm4, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE2-NEXT: packssdw %xmm8, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm6, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE2-NEXT: pxor %xmm6, %xmm2 ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 @@ -3983,42 +3983,42 @@ ; SSE2-NEXT: .LBB11_16: # %else14 ; SSE2-NEXT: retq ; SSE2-NEXT: .LBB11_1: # %cond.store -; SSE2-NEXT: movd %xmm5, %ecx +; SSE2-NEXT: movd %xmm4, %ecx ; SSE2-NEXT: movw %cx, (%rdi) ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB11_4 ; SSE2-NEXT: 
.LBB11_3: # %cond.store1 -; SSE2-NEXT: pextrw $1, %xmm5, %ecx +; SSE2-NEXT: pextrw $1, %xmm4, %ecx ; SSE2-NEXT: movw %cx, 2(%rdi) ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB11_6 ; SSE2-NEXT: .LBB11_5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm5, %ecx +; SSE2-NEXT: pextrw $2, %xmm4, %ecx ; SSE2-NEXT: movw %cx, 4(%rdi) ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB11_8 ; SSE2-NEXT: .LBB11_7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm5, %ecx +; SSE2-NEXT: pextrw $3, %xmm4, %ecx ; SSE2-NEXT: movw %cx, 6(%rdi) ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB11_10 ; SSE2-NEXT: .LBB11_9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm5, %ecx +; SSE2-NEXT: pextrw $4, %xmm4, %ecx ; SSE2-NEXT: movw %cx, 8(%rdi) ; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB11_12 ; SSE2-NEXT: .LBB11_11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm5, %ecx +; SSE2-NEXT: pextrw $5, %xmm4, %ecx ; SSE2-NEXT: movw %cx, 10(%rdi) ; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB11_14 ; SSE2-NEXT: .LBB11_13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm5, %ecx +; SSE2-NEXT: pextrw $6, %xmm4, %ecx ; SSE2-NEXT: movw %cx, 12(%rdi) ; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB11_16 ; SSE2-NEXT: .LBB11_15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm5, %eax +; SSE2-NEXT: pextrw $7, %xmm4, %eax ; SSE2-NEXT: movw %ax, 14(%rdi) ; SSE2-NEXT: retq ; @@ -4332,34 +4332,34 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i32_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm7, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm7, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm9 +; SSE2-NEXT: por %xmm1, %xmm9 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm6 -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm4, %xmm6 -; SSE2-NEXT: packuswb %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: packuswb %xmm9, %xmm4 +; SSE2-NEXT: packuswb %xmm4, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm6, %ecx +; SSE2-NEXT: movd %xmm4, %ecx ; SSE2-NEXT: jne .LBB12_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -4375,7 +4375,7 @@ ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB12_8: # %else6 ; SSE2-NEXT: testb $16, %al -; SSE2-NEXT: pextrw $2, %xmm6, %ecx +; SSE2-NEXT: pextrw $2, %xmm4, %ecx ; SSE2-NEXT: je .LBB12_10 ; SSE2-NEXT: # 
%bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -4386,7 +4386,7 @@ ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB12_12: # %else10 ; SSE2-NEXT: testb $64, %al -; SSE2-NEXT: pextrw $3, %xmm6, %ecx +; SSE2-NEXT: pextrw $3, %xmm4, %ecx ; SSE2-NEXT: jne .LBB12_13 ; SSE2-NEXT: # %bb.14: # %else12 ; SSE2-NEXT: testb $-128, %al @@ -5185,13 +5185,13 @@ ; SSE2-LABEL: truncstore_v32i16_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: psubusw %xmm8, %xmm6 -; SSE2-NEXT: psubw %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psubusw %xmm8, %xmm6 -; SSE2-NEXT: psubw %xmm6, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: psubusw %xmm6, %xmm8 +; SSE2-NEXT: psubw %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: psubusw %xmm6, %xmm8 +; SSE2-NEXT: psubw %xmm8, %xmm0 ; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqb %xmm7, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %ecx @@ -5268,9 +5268,9 @@ ; SSE2-NEXT: movb %cl, 12(%rdi) ; SSE2-NEXT: .LBB15_26: # %else24 ; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: psubusw %xmm8, %xmm1 +; SSE2-NEXT: psubusw %xmm6, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psubusw %xmm8, %xmm4 +; SSE2-NEXT: psubusw %xmm6, %xmm4 ; SSE2-NEXT: testl $8192, %eax # imm = 0x2000 ; SSE2-NEXT: je .LBB15_28 ; SSE2-NEXT: # %bb.27: # %cond.store25 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -485,25 +485,25 @@ ; AVX1-FALLBACK-NEXT: vpsubq %xmm6, %xmm3, %xmm3 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpsubq %xmm7, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm3, %xmm9 +; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm3, %xmm6 ; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm7 ; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; AVX1-FALLBACK-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm2, %xmm6 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm7, %xmm6 -; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm6, %xmm1 +; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm2, %xmm9 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm9, %xmm1 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm2, %xmm7, %xmm2 ; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm3, %xmm3 ; AVX1-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm6 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm9, %xmm6 -; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm6, %xmm3 +; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 +; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm7, %xmm3 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm9, %xmm5 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 ; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm3, %xmm3 ; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm5, %xmm3 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 @@ -536,12 +536,12 @@ ; XOP-FALLBACK-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2 ; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm4 -; 
XOP-FALLBACK-NEXT: vpcomgtq %xmm3, %xmm4, %xmm9 +; XOP-FALLBACK-NEXT: vpcomgtq %xmm3, %xmm4, %xmm5 ; XOP-FALLBACK-NEXT: vpcomltq %xmm3, %xmm4, %xmm6 ; XOP-FALLBACK-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm6 ; XOP-FALLBACK-NEXT: vpcomltq %xmm1, %xmm0, %xmm7 ; XOP-FALLBACK-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7 -; XOP-FALLBACK-NEXT: vblendvpd %xmm9, %xmm4, %xmm3, %xmm3 +; XOP-FALLBACK-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpsubq %xmm6, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpsubq %xmm7, %xmm1, %xmm1 @@ -551,13 +551,13 @@ ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; XOP-FALLBACK-NEXT: vpor %xmm2, %xmm8, %xmm2 ; XOP-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm2, %xmm5 -; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm2, %xmm9 +; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm9, %xmm1 ; XOP-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpmuludq %xmm2, %xmm7, %xmm2 ; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm3, %xmm3 -; XOP-FALLBACK-NEXT: vpor %xmm8, %xmm9, %xmm5 +; XOP-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7 ; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 @@ -576,12 +576,12 @@ ; XOPAVX1-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; XOPAVX1-NEXT: vpcomgtq %xmm3, %xmm4, %xmm9 +; XOPAVX1-NEXT: vpcomgtq %xmm3, %xmm4, %xmm5 ; XOPAVX1-NEXT: vpcomltq %xmm3, %xmm4, %xmm6 ; XOPAVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm6 ; XOPAVX1-NEXT: vpcomltq %xmm1, %xmm0, %xmm7 ; XOPAVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7 -; XOPAVX1-NEXT: vblendvpd %xmm9, %xmm4, %xmm3, %xmm3 +; XOPAVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsubq %xmm6, %xmm3, %xmm3 ; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubq %xmm7, %xmm1, %xmm1 @@ -591,13 +591,13 @@ ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; XOPAVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 ; XOPAVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 -; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOPAVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; XOPAVX1-NEXT: vpsrlq $32, %xmm2, %xmm9 +; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; XOPAVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1 ; XOPAVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpmuludq %xmm2, %xmm7, %xmm2 ; XOPAVX1-NEXT: vpsrlq $33, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpor %xmm8, %xmm9, %xmm5 +; XOPAVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsrlq $32, %xmm5, %xmm7 ; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 @@ -685,40 +685,40 @@ ; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX1-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-FALLBACK-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 +; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 ; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm1, %xmm6 ; AVX1-FALLBACK-NEXT: vpxor %xmm2, %xmm6, %xmm7 -; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-FALLBACK-NEXT: vpxor %xmm2, %xmm5, %xmm2 +; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm8 +; AVX1-FALLBACK-NEXT: vpxor %xmm2, %xmm8, %xmm2 ; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9 ; 
AVX1-FALLBACK-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2 -; AVX1-FALLBACK-NEXT: vblendvpd %xmm2, %xmm5, %xmm6, %xmm2 +; AVX1-FALLBACK-NEXT: vblendvpd %xmm2, %xmm8, %xmm6, %xmm2 ; AVX1-FALLBACK-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm3 -; AVX1-FALLBACK-NEXT: vblendvpd %xmm9, %xmm5, %xmm6, %xmm4 +; AVX1-FALLBACK-NEXT: vblendvpd %xmm9, %xmm8, %xmm6, %xmm4 ; AVX1-FALLBACK-NEXT: vpsubq %xmm2, %xmm4, %xmm2 -; AVX1-FALLBACK-NEXT: vblendvpd %xmm8, %xmm0, %xmm1, %xmm1 +; AVX1-FALLBACK-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpsubq %xmm3, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm2, %xmm3 ; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm4 ; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm10 = [1,1] -; AVX1-FALLBACK-NEXT: vpor %xmm10, %xmm8, %xmm7 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm7, %xmm6 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm4, %xmm6 -; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm6, %xmm1 +; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1] +; AVX1-FALLBACK-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 +; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm4, %xmm7 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm7, %xmm1 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm4, %xmm4 ; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpor %xmm10, %xmm9, %xmm6 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm6, %xmm7 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm3, %xmm7 -; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm7, %xmm2 +; AVX1-FALLBACK-NEXT: vpor %xmm6, %xmm9, %xmm5 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm2, %xmm2 +; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm6 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm3, %xmm6 +; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm6, %xmm2 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpaddq %xmm5, %xmm2, %xmm2 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm3, %xmm3 +; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm8, %xmm2 ; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 @@ -754,12 +754,12 @@ ; XOP-FALLBACK-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2 ; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm4 -; XOP-FALLBACK-NEXT: vpcomgtuq %xmm3, %xmm4, %xmm9 +; XOP-FALLBACK-NEXT: vpcomgtuq %xmm3, %xmm4, %xmm5 ; XOP-FALLBACK-NEXT: vpcomltuq %xmm3, %xmm4, %xmm6 ; XOP-FALLBACK-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm6 ; XOP-FALLBACK-NEXT: vpcomltuq %xmm1, %xmm0, %xmm7 ; XOP-FALLBACK-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7 -; XOP-FALLBACK-NEXT: vblendvpd %xmm9, %xmm4, %xmm3, %xmm3 +; XOP-FALLBACK-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpsubq %xmm6, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpsubq %xmm7, %xmm1, %xmm1 @@ -769,13 +769,13 @@ ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; XOP-FALLBACK-NEXT: vpor %xmm2, %xmm8, %xmm2 ; XOP-FALLBACK-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm2, %xmm5 -; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; 
XOP-FALLBACK-NEXT: vpsrlq $32, %xmm2, %xmm9 +; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm9, %xmm1 ; XOP-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpmuludq %xmm2, %xmm7, %xmm2 ; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm3, %xmm3 -; XOP-FALLBACK-NEXT: vpor %xmm8, %xmm9, %xmm5 +; XOP-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7 ; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 @@ -794,12 +794,12 @@ ; XOPAVX1-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; XOPAVX1-NEXT: vpcomgtuq %xmm3, %xmm4, %xmm9 +; XOPAVX1-NEXT: vpcomgtuq %xmm3, %xmm4, %xmm5 ; XOPAVX1-NEXT: vpcomltuq %xmm3, %xmm4, %xmm6 ; XOPAVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm6 ; XOPAVX1-NEXT: vpcomltuq %xmm1, %xmm0, %xmm7 ; XOPAVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7 -; XOPAVX1-NEXT: vblendvpd %xmm9, %xmm4, %xmm3, %xmm3 +; XOPAVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsubq %xmm6, %xmm3, %xmm3 ; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubq %xmm7, %xmm1, %xmm1 @@ -809,13 +809,13 @@ ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; XOPAVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 ; XOPAVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 -; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOPAVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; XOPAVX1-NEXT: vpsrlq $32, %xmm2, %xmm9 +; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; XOPAVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1 ; XOPAVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpmuludq %xmm2, %xmm7, %xmm2 ; XOPAVX1-NEXT: vpsrlq $33, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpor %xmm8, %xmm9, %xmm5 +; XOPAVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsrlq $32, %xmm5, %xmm7 ; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 @@ -913,25 +913,25 @@ ; AVX1-FALLBACK-NEXT: vpsubq %xmm6, %xmm4, %xmm4 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: vpsubq %xmm7, %xmm0, %xmm0 -; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm4, %xmm9 +; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm4, %xmm6 ; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; AVX1-FALLBACK-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm0, %xmm0 -; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm6 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm7, %xmm6 -; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm6, %xmm0 +; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm9 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm9, %xmm0 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm7, %xmm3 ; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm4, %xmm4 ; AVX1-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm4, %xmm4 -; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm6 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm9, %xmm6 -; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm6, %xmm4 +; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 +; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm7, %xmm4 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm9, %xmm5 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 ; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; 
AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2 ; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 @@ -966,12 +966,12 @@ ; XOP-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm2 ; XOP-FALLBACK-NEXT: vpcomgtq %xmm0, %xmm1, %xmm3 ; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm4 -; XOP-FALLBACK-NEXT: vpcomgtq %xmm4, %xmm2, %xmm9 +; XOP-FALLBACK-NEXT: vpcomgtq %xmm4, %xmm2, %xmm5 ; XOP-FALLBACK-NEXT: vpcomltq %xmm4, %xmm2, %xmm6 ; XOP-FALLBACK-NEXT: vblendvpd %xmm6, %xmm2, %xmm4, %xmm6 ; XOP-FALLBACK-NEXT: vpcomltq %xmm0, %xmm1, %xmm7 ; XOP-FALLBACK-NEXT: vblendvpd %xmm7, %xmm1, %xmm0, %xmm7 -; XOP-FALLBACK-NEXT: vblendvpd %xmm9, %xmm2, %xmm4, %xmm4 +; XOP-FALLBACK-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm4 ; XOP-FALLBACK-NEXT: vpsubq %xmm6, %xmm4, %xmm4 ; XOP-FALLBACK-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vpsubq %xmm7, %xmm0, %xmm0 @@ -981,13 +981,13 @@ ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; XOP-FALLBACK-NEXT: vpor %xmm3, %xmm8, %xmm3 ; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm5 -; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm5, %xmm0 +; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm9 +; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm9, %xmm0 ; XOP-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm7, %xmm3 ; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm4, %xmm4 -; XOP-FALLBACK-NEXT: vpor %xmm8, %xmm9, %xmm5 +; XOP-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm4, %xmm4 ; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7 ; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 @@ -1007,12 +1007,12 @@ ; XOPAVX1-NEXT: vmovdqa 16(%rdi), %xmm2 ; XOPAVX1-NEXT: vpcomgtq %xmm0, %xmm1, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; XOPAVX1-NEXT: vpcomgtq %xmm4, %xmm2, %xmm9 +; XOPAVX1-NEXT: vpcomgtq %xmm4, %xmm2, %xmm5 ; XOPAVX1-NEXT: vpcomltq %xmm4, %xmm2, %xmm6 ; XOPAVX1-NEXT: vblendvpd %xmm6, %xmm2, %xmm4, %xmm6 ; XOPAVX1-NEXT: vpcomltq %xmm0, %xmm1, %xmm7 ; XOPAVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm0, %xmm7 -; XOPAVX1-NEXT: vblendvpd %xmm9, %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpsubq %xmm6, %xmm4, %xmm4 ; XOPAVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpsubq %xmm7, %xmm0, %xmm0 @@ -1022,13 +1022,13 @@ ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; XOPAVX1-NEXT: vpor %xmm3, %xmm8, %xmm3 ; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 -; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOPAVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 +; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm9 +; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm9, %xmm0 ; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm7, %xmm3 ; XOPAVX1-NEXT: vpsrlq $33, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpor %xmm8, %xmm9, %xmm5 +; XOPAVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpsrlq $32, %xmm5, %xmm7 ; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 @@ -1126,25 +1126,25 @@ ; AVX1-FALLBACK-NEXT: vpsubq %xmm6, %xmm2, %xmm2 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpsubq %xmm7, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm2, %xmm9 +; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm2, %xmm6 ; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm7 ; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vmovdqa 
{{.*#+}} xmm8 = [1,1] ; AVX1-FALLBACK-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm6 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm7, %xmm6 -; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm6, %xmm1 +; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm9 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm9, %xmm1 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm7, %xmm3 ; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm2, %xmm2 ; AVX1-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm6 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm9, %xmm6 -; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm6, %xmm2 +; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 +; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm7, %xmm2 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm9, %xmm5 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 ; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2 ; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 @@ -1179,12 +1179,12 @@ ; XOP-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm2 ; XOP-FALLBACK-NEXT: vpcomgtq %xmm1, %xmm0, %xmm3 ; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm4 -; XOP-FALLBACK-NEXT: vpcomgtq %xmm2, %xmm4, %xmm9 +; XOP-FALLBACK-NEXT: vpcomgtq %xmm2, %xmm4, %xmm5 ; XOP-FALLBACK-NEXT: vpcomltq %xmm2, %xmm4, %xmm6 ; XOP-FALLBACK-NEXT: vblendvpd %xmm6, %xmm4, %xmm2, %xmm6 ; XOP-FALLBACK-NEXT: vpcomltq %xmm1, %xmm0, %xmm7 ; XOP-FALLBACK-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7 -; XOP-FALLBACK-NEXT: vblendvpd %xmm9, %xmm4, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpsubq %xmm6, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpsubq %xmm7, %xmm1, %xmm1 @@ -1194,13 +1194,13 @@ ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; XOP-FALLBACK-NEXT: vpor %xmm3, %xmm8, %xmm3 ; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm5 -; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm9 +; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm9, %xmm1 ; XOP-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm7, %xmm3 ; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpor %xmm8, %xmm9, %xmm5 +; XOP-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7 ; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 @@ -1220,12 +1220,12 @@ ; XOPAVX1-NEXT: vmovdqa 16(%rdi), %xmm2 ; XOPAVX1-NEXT: vpcomgtq %xmm1, %xmm0, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; XOPAVX1-NEXT: vpcomgtq %xmm2, %xmm4, %xmm9 +; XOPAVX1-NEXT: vpcomgtq %xmm2, %xmm4, %xmm5 ; XOPAVX1-NEXT: vpcomltq %xmm2, %xmm4, %xmm6 ; XOPAVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm2, %xmm6 ; XOPAVX1-NEXT: vpcomltq %xmm1, %xmm0, %xmm7 ; XOPAVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7 -; XOPAVX1-NEXT: vblendvpd %xmm9, %xmm4, %xmm2, %xmm2 +; XOPAVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2 ; XOPAVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubq %xmm7, %xmm1, 
%xmm1 @@ -1235,13 +1235,13 @@ ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; XOPAVX1-NEXT: vpor %xmm3, %xmm8, %xmm3 ; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 -; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOPAVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm9 +; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; XOPAVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1 ; XOPAVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm7, %xmm3 ; XOPAVX1-NEXT: vpsrlq $33, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpor %xmm8, %xmm9, %xmm5 +; XOPAVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrlq $32, %xmm5, %xmm7 ; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 @@ -1340,25 +1340,25 @@ ; AVX1-FALLBACK-NEXT: vpsubq %xmm6, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vblendvpd %xmm4, %xmm2, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: vpsubq %xmm7, %xmm0, %xmm0 -; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm9 +; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-FALLBACK-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; AVX1-FALLBACK-NEXT: vpor %xmm4, %xmm8, %xmm4 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 -; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm4, %xmm6 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm7, %xmm6 -; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm6, %xmm0 +; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm4, %xmm9 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm9, %xmm0 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 ; AVX1-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm6 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm6, %xmm9, %xmm6 -; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm6, %xmm1 +; AVX1-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm7, %xmm1 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm9, %xmm5 +; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 ; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm5, %xmm1 ; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 @@ -1395,12 +1395,12 @@ ; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0 ; XOP-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm3 ; XOP-FALLBACK-NEXT: vpcomgtq %xmm1, %xmm0, %xmm4 -; XOP-FALLBACK-NEXT: vpcomgtq %xmm2, %xmm3, %xmm9 +; XOP-FALLBACK-NEXT: vpcomgtq %xmm2, %xmm3, %xmm5 ; XOP-FALLBACK-NEXT: vpcomltq %xmm2, %xmm3, %xmm6 ; XOP-FALLBACK-NEXT: vblendvpd %xmm6, %xmm3, %xmm2, %xmm6 ; XOP-FALLBACK-NEXT: vpcomltq %xmm1, %xmm0, %xmm7 ; XOP-FALLBACK-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7 -; XOP-FALLBACK-NEXT: vblendvpd %xmm9, %xmm3, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vblendvpd %xmm5, %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpsubq %xmm6, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpsubq %xmm7, %xmm1, %xmm1 @@ -1410,13 +1410,13 @@ ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; XOP-FALLBACK-NEXT: vpor %xmm4, %xmm8, %xmm4 ; XOP-FALLBACK-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm4, %xmm5 -; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm4, %xmm9 +; 
XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm9, %xmm1 ; XOP-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 ; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpor %xmm8, %xmm9, %xmm5 +; XOP-FALLBACK-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm5, %xmm7 ; XOP-FALLBACK-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 @@ -1437,12 +1437,12 @@ ; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm0 ; XOPAVX1-NEXT: vmovdqa 16(%rdi), %xmm3 ; XOPAVX1-NEXT: vpcomgtq %xmm1, %xmm0, %xmm4 -; XOPAVX1-NEXT: vpcomgtq %xmm2, %xmm3, %xmm9 +; XOPAVX1-NEXT: vpcomgtq %xmm2, %xmm3, %xmm5 ; XOPAVX1-NEXT: vpcomltq %xmm2, %xmm3, %xmm6 ; XOPAVX1-NEXT: vblendvpd %xmm6, %xmm3, %xmm2, %xmm6 ; XOPAVX1-NEXT: vpcomltq %xmm1, %xmm0, %xmm7 ; XOPAVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm1, %xmm7 -; XOPAVX1-NEXT: vblendvpd %xmm9, %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2 ; XOPAVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubq %xmm7, %xmm1, %xmm1 @@ -1452,13 +1452,13 @@ ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1] ; XOPAVX1-NEXT: vpor %xmm4, %xmm8, %xmm4 ; XOPAVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpsrlq $32, %xmm4, %xmm5 -; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 -; XOPAVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 +; XOPAVX1-NEXT: vpsrlq $32, %xmm4, %xmm9 +; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9 +; XOPAVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1 ; XOPAVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4 ; XOPAVX1-NEXT: vpsrlq $33, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpor %xmm8, %xmm9, %xmm5 +; XOPAVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrlq $32, %xmm5, %xmm7 ; XOPAVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7 @@ -1700,10 +1700,10 @@ ; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-FALLBACK-NEXT: vpminuw %xmm2, %xmm3, %xmm4 ; AVX1-FALLBACK-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm5 -; AVX1-FALLBACK-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 -; AVX1-FALLBACK-NEXT: vpxor %xmm5, %xmm8, %xmm5 +; AVX1-FALLBACK-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 +; AVX1-FALLBACK-NEXT: vpxor %xmm6, %xmm5, %xmm5 ; AVX1-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm7 -; AVX1-FALLBACK-NEXT: vpcmpeqw %xmm7, %xmm0, %xmm6 +; AVX1-FALLBACK-NEXT: vpcmpeqw %xmm7, %xmm0, %xmm8 ; AVX1-FALLBACK-NEXT: vpxor %xmm6, %xmm8, %xmm6 ; AVX1-FALLBACK-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 ; AVX1-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 @@ -2317,7 +2317,7 @@ ; AVX1-FALLBACK: # %bb.0: ; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm8 +; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4 ; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5 ; AVX1-FALLBACK-NEXT: vpminsb %xmm3, %xmm2, %xmm6 ; AVX1-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm7 @@ -2333,25 +2333,25 @@ ; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4 -; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4 +; 
AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6 +; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm8, %xmm6 ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm5 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4 -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpand %xmm1, %xmm8, %xmm1 +; AVX1-FALLBACK-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 +; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm5, %xmm5 +; AVX1-FALLBACK-NEXT: vpand %xmm5, %xmm8, %xmm5 ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3 +; AVX1-FALLBACK-NEXT: vpand %xmm3, %xmm8, %xmm3 +; AVX1-FALLBACK-NEXT: vpackuswb %xmm5, %xmm3, %xmm3 ; AVX1-FALLBACK-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; AVX1-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -2383,7 +2383,7 @@ ; XOP-FALLBACK: # %bb.0: ; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOP-FALLBACK-NEXT: vpcomgtb %xmm2, %xmm3, %xmm8 +; XOP-FALLBACK-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5 ; XOP-FALLBACK-NEXT: vpminsb %xmm2, %xmm3, %xmm6 ; XOP-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm7 @@ -2397,21 +2397,21 @@ ; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6 ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; 
XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; XOP-FALLBACK-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm6 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4 +; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -2421,7 +2421,7 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vpcomgtb %xmm2, %xmm3, %xmm8 +; XOPAVX1-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5 ; XOPAVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6 ; XOPAVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm7 @@ -2435,21 +2435,21 @@ ; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOPAVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; XOPAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpor %xmm7, %xmm8, %xmm6 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpor %xmm7, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; XOPAVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; XOPAVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -2540,10 +2540,10 @@ ; AVX1-FALLBACK-NEXT: vpminub %xmm3, %xmm2, %xmm4 ; AVX1-FALLBACK-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm5 ; AVX1-FALLBACK-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-FALLBACK-NEXT: vpxor %xmm6, %xmm5, %xmm8 -; AVX1-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm7 -; AVX1-FALLBACK-NEXT: vpcmpeqb %xmm7, %xmm0, %xmm5 ; AVX1-FALLBACK-NEXT: vpxor %xmm6, %xmm5, %xmm5 +; AVX1-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm7 +; AVX1-FALLBACK-NEXT: vpcmpeqb %xmm7, %xmm0, %xmm8 +; AVX1-FALLBACK-NEXT: vpxor %xmm6, %xmm8, %xmm6 ; AVX1-FALLBACK-NEXT: vpmaxub %xmm3, %xmm2, %xmm3 ; AVX1-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX1-FALLBACK-NEXT: vpsubb %xmm7, %xmm1, %xmm1 @@ -2554,26 +2554,26 @@ ; AVX1-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-FALLBACK-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4 -; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-FALLBACK-NEXT: vpand %xmm7, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm8, %xmm4 +; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-FALLBACK-NEXT: vpand %xmm4, %xmm8, %xmm4 ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpand %xmm7, %xmm1, %xmm1 +; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm1, %xmm1 +; AVX1-FALLBACK-NEXT: vpand %xmm1, %xmm8, %xmm1 ; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpor 
%xmm6, %xmm8, %xmm5 +; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm4, %xmm4 -; AVX1-FALLBACK-NEXT: vpand %xmm7, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpand %xmm4, %xmm8, %xmm4 ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-FALLBACK-NEXT: vpand %xmm3, %xmm8, %xmm3 ; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 ; AVX1-FALLBACK-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; AVX1-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2608,7 +2608,7 @@ ; XOP-FALLBACK: # %bb.0: ; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOP-FALLBACK-NEXT: vpcomgtub %xmm2, %xmm3, %xmm8 +; XOP-FALLBACK-NEXT: vpcomgtub %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpcomgtub %xmm1, %xmm0, %xmm5 ; XOP-FALLBACK-NEXT: vpminub %xmm2, %xmm3, %xmm6 ; XOP-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm7 @@ -2622,21 +2622,21 @@ ; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6 ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; XOP-FALLBACK-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm6 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4 +; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = 
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -2646,7 +2646,7 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vpcomgtub %xmm2, %xmm3, %xmm8 +; XOPAVX1-NEXT: vpcomgtub %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpcomgtub %xmm1, %xmm0, %xmm5 ; XOPAVX1-NEXT: vpminub %xmm2, %xmm3, %xmm6 ; XOPAVX1-NEXT: vpminub %xmm1, %xmm0, %xmm7 @@ -2660,21 +2660,21 @@ ; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOPAVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; XOPAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpor %xmm7, %xmm8, %xmm6 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpor %xmm7, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; XOPAVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; XOPAVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -2767,7 +2767,7 @@ ; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm8 +; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4 ; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm5 ; AVX1-FALLBACK-NEXT: vpminsb %xmm3, %xmm2, %xmm6 ; AVX1-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm7 @@ -2783,25 +2783,25 @@ ; AVX1-FALLBACK-NEXT: vpunpckhbw 
{{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4 -; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6 +; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm8, %xmm6 ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm0, %xmm0 -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm5 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4 -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpand %xmm0, %xmm8, %xmm0 +; AVX1-FALLBACK-NEXT: vpackuswb %xmm6, %xmm0, %xmm0 +; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm5, %xmm5 +; AVX1-FALLBACK-NEXT: vpand %xmm5, %xmm8, %xmm5 ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3 +; AVX1-FALLBACK-NEXT: vpand %xmm3, %xmm8, %xmm3 +; AVX1-FALLBACK-NEXT: vpackuswb %xmm5, %xmm3, %xmm3 ; AVX1-FALLBACK-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; AVX1-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -2835,7 +2835,7 @@ ; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; XOP-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-FALLBACK-NEXT: vpcomgtb %xmm2, %xmm3, %xmm8 +; XOP-FALLBACK-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpcomgtb %xmm0, %xmm1, %xmm5 ; XOP-FALLBACK-NEXT: vpminsb %xmm2, %xmm3, %xmm6 ; XOP-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm7 @@ -2849,21 +2849,21 @@ ; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; 
XOP-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6 ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; XOP-FALLBACK-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm6 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4 +; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm0, %xmm0 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -2874,7 +2874,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1 ; XOPAVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOPAVX1-NEXT: vpcomgtb %xmm2, %xmm3, %xmm8 +; XOPAVX1-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpcomgtb %xmm0, %xmm1, %xmm5 ; XOPAVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6 ; XOPAVX1-NEXT: vpminsb %xmm0, %xmm1, %xmm7 @@ -2888,21 +2888,21 @@ ; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOPAVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; XOPAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} 
xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpor %xmm7, %xmm8, %xmm6 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpor %xmm7, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; XOPAVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; XOPAVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -2995,7 +2995,7 @@ ; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm2 ; AVX1-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX1-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm8 +; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm4 ; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm5 ; AVX1-FALLBACK-NEXT: vpminsb %xmm3, %xmm1, %xmm6 ; AVX1-FALLBACK-NEXT: vpminsb %xmm2, %xmm0, %xmm7 @@ -3011,25 +3011,25 @@ ; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4 -; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6 +; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm8, %xmm6 ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm5 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4 -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpand %xmm2, %xmm8, 
%xmm2 +; AVX1-FALLBACK-NEXT: vpackuswb %xmm6, %xmm2, %xmm2 +; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm5, %xmm5 +; AVX1-FALLBACK-NEXT: vpand %xmm5, %xmm8, %xmm5 ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3 +; AVX1-FALLBACK-NEXT: vpand %xmm3, %xmm8, %xmm3 +; AVX1-FALLBACK-NEXT: vpackuswb %xmm5, %xmm3, %xmm3 ; AVX1-FALLBACK-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; AVX1-FALLBACK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -3063,7 +3063,7 @@ ; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; XOP-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm2 ; XOP-FALLBACK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOP-FALLBACK-NEXT: vpcomgtb %xmm2, %xmm3, %xmm8 +; XOP-FALLBACK-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5 ; XOP-FALLBACK-NEXT: vpminsb %xmm2, %xmm3, %xmm6 ; XOP-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm7 @@ -3077,21 +3077,21 @@ ; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6 ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; XOP-FALLBACK-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm6 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4 +; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -3102,7 +3102,7 @@ ; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1 ; XOPAVX1-NEXT: vmovdqa 16(%rdi), %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vpcomgtb %xmm2, %xmm3, %xmm8 +; XOPAVX1-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpcomgtb %xmm1, %xmm0, %xmm5 ; XOPAVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6 ; XOPAVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm7 @@ -3116,21 +3116,21 @@ ; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOPAVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; XOPAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpor %xmm7, %xmm8, %xmm6 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpor %xmm7, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; XOPAVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; XOPAVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -3224,7 +3224,7 @@ ; AVX1-FALLBACK-NEXT: vmovdqa 
16(%rsi), %xmm3 ; AVX1-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm8 +; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm4 ; AVX1-FALLBACK-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm5 ; AVX1-FALLBACK-NEXT: vpminsb %xmm3, %xmm1, %xmm6 ; AVX1-FALLBACK-NEXT: vpminsb %xmm2, %xmm0, %xmm7 @@ -3240,25 +3240,25 @@ ; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4 -; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6 +; AVX1-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm8, %xmm6 ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm5 -; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4 -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpand %xmm2, %xmm8, %xmm2 +; AVX1-FALLBACK-NEXT: vpackuswb %xmm6, %xmm2, %xmm2 +; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-FALLBACK-NEXT: vpmullw %xmm6, %xmm5, %xmm5 +; AVX1-FALLBACK-NEXT: vpand %xmm5, %xmm8, %xmm5 ; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-FALLBACK-NEXT: vpmullw %xmm5, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3 +; AVX1-FALLBACK-NEXT: vpand %xmm3, %xmm8, %xmm3 +; AVX1-FALLBACK-NEXT: vpackuswb %xmm5, %xmm3, %xmm3 ; AVX1-FALLBACK-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; AVX1-FALLBACK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -3294,7 +3294,7 @@ ; XOP-FALLBACK-NEXT: vmovdqa 16(%rsi), %xmm1 ; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm2 ; XOP-FALLBACK-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-FALLBACK-NEXT: vpcomgtb %xmm1, %xmm3, %xmm8 
+; XOP-FALLBACK-NEXT: vpcomgtb %xmm1, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpcomgtb %xmm0, %xmm2, %xmm5 ; XOP-FALLBACK-NEXT: vpminsb %xmm1, %xmm3, %xmm6 ; XOP-FALLBACK-NEXT: vpminsb %xmm0, %xmm2, %xmm7 @@ -3308,21 +3308,21 @@ ; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm6, %xmm4 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm8, %xmm6 ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; XOP-FALLBACK-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm8, %xmm6 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm4, %xmm4 +; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm0, %xmm0 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpor %xmm7, %xmm4, %xmm4 +; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOP-FALLBACK-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; XOP-FALLBACK-NEXT: vpmullw %xmm6, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -3334,7 +3334,7 @@ ; XOPAVX1-NEXT: vmovdqa 16(%rsi), %xmm1 ; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm2 ; XOPAVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOPAVX1-NEXT: vpcomgtb %xmm1, %xmm3, %xmm8 +; XOPAVX1-NEXT: vpcomgtb %xmm1, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpcomgtb %xmm0, %xmm2, %xmm5 ; XOPAVX1-NEXT: vpminsb %xmm1, %xmm3, %xmm6 ; XOPAVX1-NEXT: vpminsb %xmm0, %xmm2, %xmm7 @@ -3348,21 +3348,21 @@ ; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; XOPAVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: 
vpmullw %xmm6, %xmm8, %xmm6 ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; XOPAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpor %xmm7, %xmm8, %xmm6 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpor %xmm7, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; XOPAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; XOPAVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm %xmm5, %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; XOPAVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/misched-matmul.ll b/llvm/test/CodeGen/X86/misched-matmul.ll --- a/llvm/test/CodeGen/X86/misched-matmul.ll +++ b/llvm/test/CodeGen/X86/misched-matmul.ll @@ -10,7 +10,7 @@ ; more complex cases. 
; ; CHECK: @wrap_mul4 -; CHECK: 25 regalloc - Number of spills inserted +; CHECK: 24 regalloc - Number of spills inserted define void @wrap_mul4(ptr nocapture %Out, ptr nocapture %A, ptr nocapture %B) #0 { entry: diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -426,19 +426,19 @@ ; ; X64-LABEL: test3: ; X64: # %bb.0: # %entry -; X64-NEXT: xorl %r8d, %r8d +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: testl %edx, %edx ; X64-NEXT: je .LBB3_2 ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB3_1: # %bb26 ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movslq %r8d, %r8 -; X64-NEXT: movq (%rdi,%r8,8), %rcx -; X64-NEXT: addq (%rsi,%r8,8), %rcx -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: incl %r8d -; X64-NEXT: cmpl %edx, %r8d +; X64-NEXT: movslq %ecx, %rcx +; X64-NEXT: movq (%rdi,%rcx,8), %r8 +; X64-NEXT: addq (%rsi,%rcx,8), %r8 +; X64-NEXT: addq %r8, %rax +; X64-NEXT: incl %ecx +; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: jb .LBB3_1 ; X64-NEXT: .LBB3_2: # %bb31 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/mul-constant-result.ll b/llvm/test/CodeGen/X86/mul-constant-result.ll --- a/llvm/test/CodeGen/X86/mul-constant-result.ll +++ b/llvm/test/CodeGen/X86/mul-constant-result.ll @@ -898,21 +898,21 @@ ; X64-HSW-NEXT: movl $5, %edi ; X64-HSW-NEXT: movl $2, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %r14d -; X64-HSW-NEXT: xorl $5, %r14d +; X64-HSW-NEXT: movl %eax, %ebp +; X64-HSW-NEXT: xorl $5, %ebp ; X64-HSW-NEXT: movl $6, %edi ; X64-HSW-NEXT: movl $3, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebp -; X64-HSW-NEXT: xorl $6, %ebp -; X64-HSW-NEXT: orl %r14d, %ebp +; X64-HSW-NEXT: movl %eax, %r14d +; X64-HSW-NEXT: xorl $6, %r14d +; X64-HSW-NEXT: orl %ebp, %r14d ; X64-HSW-NEXT: movl $7, %edi ; X64-HSW-NEXT: movl $3, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %r14d -; X64-HSW-NEXT: xorl $7, %r14d -; X64-HSW-NEXT: orl %ebp, %r14d -; X64-HSW-NEXT: orl %ebx, %r14d +; X64-HSW-NEXT: movl %eax, %ebp +; X64-HSW-NEXT: xorl $7, %ebp +; X64-HSW-NEXT: orl %r14d, %ebp +; X64-HSW-NEXT: orl %ebx, %ebp ; X64-HSW-NEXT: movl $8, %edi ; X64-HSW-NEXT: movl $4, %esi ; X64-HSW-NEXT: callq mult@PLT @@ -921,88 +921,88 @@ ; X64-HSW-NEXT: movl $9, %edi ; X64-HSW-NEXT: movl $4, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebp -; X64-HSW-NEXT: xorl $9, %ebp -; X64-HSW-NEXT: orl %ebx, %ebp +; X64-HSW-NEXT: movl %eax, %r14d +; X64-HSW-NEXT: xorl $9, %r14d +; X64-HSW-NEXT: orl %ebx, %r14d ; X64-HSW-NEXT: movl $10, %edi ; X64-HSW-NEXT: movl $5, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebx -; X64-HSW-NEXT: xorl $10, %ebx -; X64-HSW-NEXT: orl %ebp, %ebx +; X64-HSW-NEXT: movl %eax, %r15d +; X64-HSW-NEXT: xorl $10, %r15d +; X64-HSW-NEXT: orl %r14d, %r15d ; X64-HSW-NEXT: movl $11, %edi ; X64-HSW-NEXT: movl $5, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %r15d -; X64-HSW-NEXT: xorl $11, %r15d -; X64-HSW-NEXT: orl %ebx, %r15d -; X64-HSW-NEXT: orl %r14d, %r15d +; X64-HSW-NEXT: movl %eax, %ebx +; X64-HSW-NEXT: xorl $11, %ebx +; X64-HSW-NEXT: orl %r15d, %ebx +; X64-HSW-NEXT: orl %ebp, %ebx ; X64-HSW-NEXT: movl $12, %edi ; X64-HSW-NEXT: movl $6, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebx -; X64-HSW-NEXT: xorl $12, %ebx +; X64-HSW-NEXT: movl %eax, %ebp +; X64-HSW-NEXT: xorl $12, %ebp ; X64-HSW-NEXT: movl $13, %edi ; X64-HSW-NEXT: movl $6, %esi ; 
X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebp -; X64-HSW-NEXT: xorl $13, %ebp -; X64-HSW-NEXT: orl %ebx, %ebp +; X64-HSW-NEXT: movl %eax, %r14d +; X64-HSW-NEXT: xorl $13, %r14d +; X64-HSW-NEXT: orl %ebp, %r14d ; X64-HSW-NEXT: movl $14, %edi ; X64-HSW-NEXT: movl $7, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebx -; X64-HSW-NEXT: xorl $14, %ebx -; X64-HSW-NEXT: orl %ebp, %ebx +; X64-HSW-NEXT: movl %eax, %ebp +; X64-HSW-NEXT: xorl $14, %ebp +; X64-HSW-NEXT: orl %r14d, %ebp ; X64-HSW-NEXT: movl $15, %edi ; X64-HSW-NEXT: movl $7, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebp -; X64-HSW-NEXT: xorl $15, %ebp -; X64-HSW-NEXT: orl %ebx, %ebp +; X64-HSW-NEXT: movl %eax, %r14d +; X64-HSW-NEXT: xorl $15, %r14d +; X64-HSW-NEXT: orl %ebp, %r14d ; X64-HSW-NEXT: movl $16, %edi ; X64-HSW-NEXT: movl $8, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %r14d -; X64-HSW-NEXT: xorl $16, %r14d -; X64-HSW-NEXT: orl %ebp, %r14d -; X64-HSW-NEXT: orl %r15d, %r14d +; X64-HSW-NEXT: movl %eax, %ebp +; X64-HSW-NEXT: xorl $16, %ebp +; X64-HSW-NEXT: orl %r14d, %ebp +; X64-HSW-NEXT: orl %ebx, %ebp ; X64-HSW-NEXT: movl $17, %edi ; X64-HSW-NEXT: movl $8, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebp -; X64-HSW-NEXT: xorl $17, %ebp +; X64-HSW-NEXT: movl %eax, %ebx +; X64-HSW-NEXT: xorl $17, %ebx ; X64-HSW-NEXT: movl $18, %edi ; X64-HSW-NEXT: movl $9, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebx -; X64-HSW-NEXT: xorl $18, %ebx -; X64-HSW-NEXT: orl %ebp, %ebx +; X64-HSW-NEXT: movl %eax, %r14d +; X64-HSW-NEXT: xorl $18, %r14d +; X64-HSW-NEXT: orl %ebx, %r14d ; X64-HSW-NEXT: movl $19, %edi ; X64-HSW-NEXT: movl $9, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebp -; X64-HSW-NEXT: xorl $19, %ebp -; X64-HSW-NEXT: orl %ebx, %ebp +; X64-HSW-NEXT: movl %eax, %ebx +; X64-HSW-NEXT: xorl $19, %ebx +; X64-HSW-NEXT: orl %r14d, %ebx ; X64-HSW-NEXT: movl $20, %edi ; X64-HSW-NEXT: movl $10, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebx -; X64-HSW-NEXT: xorl $20, %ebx -; X64-HSW-NEXT: orl %ebp, %ebx +; X64-HSW-NEXT: movl %eax, %r14d +; X64-HSW-NEXT: xorl $20, %r14d +; X64-HSW-NEXT: orl %ebx, %r14d ; X64-HSW-NEXT: movl $21, %edi ; X64-HSW-NEXT: movl $10, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebp -; X64-HSW-NEXT: xorl $21, %ebp -; X64-HSW-NEXT: orl %ebx, %ebp +; X64-HSW-NEXT: movl %eax, %r15d +; X64-HSW-NEXT: xorl $21, %r15d +; X64-HSW-NEXT: orl %r14d, %r15d ; X64-HSW-NEXT: movl $22, %edi ; X64-HSW-NEXT: movl $11, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %r15d -; X64-HSW-NEXT: xorl $22, %r15d -; X64-HSW-NEXT: orl %ebp, %r15d -; X64-HSW-NEXT: orl %r14d, %r15d +; X64-HSW-NEXT: movl %eax, %ebx +; X64-HSW-NEXT: xorl $22, %ebx +; X64-HSW-NEXT: orl %r15d, %ebx +; X64-HSW-NEXT: orl %ebp, %ebx ; X64-HSW-NEXT: movl $23, %edi ; X64-HSW-NEXT: movl $11, %esi ; X64-HSW-NEXT: callq mult@PLT @@ -1011,58 +1011,58 @@ ; X64-HSW-NEXT: movl $24, %edi ; X64-HSW-NEXT: movl $12, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebx -; X64-HSW-NEXT: xorl $24, %ebx -; X64-HSW-NEXT: orl %ebp, %ebx +; X64-HSW-NEXT: movl %eax, %r14d +; X64-HSW-NEXT: xorl $24, %r14d +; X64-HSW-NEXT: orl %ebp, %r14d ; X64-HSW-NEXT: movl $25, %edi ; X64-HSW-NEXT: movl $12, %esi ; X64-HSW-NEXT: callq mult@PLT ; X64-HSW-NEXT: movl %eax, %ebp ; X64-HSW-NEXT: xorl $25, %ebp -; X64-HSW-NEXT: orl %ebx, %ebp +; X64-HSW-NEXT: orl %r14d, %ebp ; X64-HSW-NEXT: movl $26, %edi ; 
X64-HSW-NEXT: movl $13, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebx -; X64-HSW-NEXT: xorl $26, %ebx -; X64-HSW-NEXT: orl %ebp, %ebx +; X64-HSW-NEXT: movl %eax, %r14d +; X64-HSW-NEXT: xorl $26, %r14d +; X64-HSW-NEXT: orl %ebp, %r14d ; X64-HSW-NEXT: movl $27, %edi ; X64-HSW-NEXT: movl $13, %esi ; X64-HSW-NEXT: callq mult@PLT ; X64-HSW-NEXT: movl %eax, %ebp ; X64-HSW-NEXT: xorl $27, %ebp -; X64-HSW-NEXT: orl %ebx, %ebp +; X64-HSW-NEXT: orl %r14d, %ebp ; X64-HSW-NEXT: movl $28, %edi ; X64-HSW-NEXT: movl $14, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebx -; X64-HSW-NEXT: xorl $28, %ebx -; X64-HSW-NEXT: orl %ebp, %ebx +; X64-HSW-NEXT: movl %eax, %r14d +; X64-HSW-NEXT: xorl $28, %r14d +; X64-HSW-NEXT: orl %ebp, %r14d ; X64-HSW-NEXT: movl $29, %edi ; X64-HSW-NEXT: movl $14, %esi ; X64-HSW-NEXT: callq mult@PLT ; X64-HSW-NEXT: movl %eax, %ebp ; X64-HSW-NEXT: xorl $29, %ebp +; X64-HSW-NEXT: orl %r14d, %ebp ; X64-HSW-NEXT: orl %ebx, %ebp -; X64-HSW-NEXT: orl %r15d, %ebp ; X64-HSW-NEXT: movl $30, %edi ; X64-HSW-NEXT: movl $15, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %r14d -; X64-HSW-NEXT: xorl $30, %r14d +; X64-HSW-NEXT: movl %eax, %ebx +; X64-HSW-NEXT: xorl $30, %ebx ; X64-HSW-NEXT: movl $31, %edi ; X64-HSW-NEXT: movl $15, %esi ; X64-HSW-NEXT: callq mult@PLT -; X64-HSW-NEXT: movl %eax, %ebx -; X64-HSW-NEXT: xorl $31, %ebx -; X64-HSW-NEXT: orl %r14d, %ebx -; X64-HSW-NEXT: orl %ebp, %ebx +; X64-HSW-NEXT: movl %eax, %r14d +; X64-HSW-NEXT: xorl $31, %r14d +; X64-HSW-NEXT: orl %ebx, %r14d +; X64-HSW-NEXT: orl %ebp, %r14d ; X64-HSW-NEXT: movl $32, %edi ; X64-HSW-NEXT: movl $16, %esi ; X64-HSW-NEXT: callq mult@PLT ; X64-HSW-NEXT: xorl $32, %eax ; X64-HSW-NEXT: xorl %ecx, %ecx -; X64-HSW-NEXT: orl %ebx, %eax +; X64-HSW-NEXT: orl %r14d, %eax ; X64-HSW-NEXT: setne %cl ; X64-HSW-NEXT: negl %ecx ; X64-HSW-NEXT: movl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll --- a/llvm/test/CodeGen/X86/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/mul-i1024.ll @@ -4804,281 +4804,282 @@ ; X64-NEXT: pushq %rbx ; X64-NEXT: subq $240, %rsp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rsi, %rbp ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 40(%rdi), %r15 -; X64-NEXT: movq 32(%rdi), %r9 -; X64-NEXT: movq 56(%rdi), %r8 -; X64-NEXT: movq 48(%rdi), %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq (%rsi), %rdi -; X64-NEXT: movq 8(%rsi), %r11 -; X64-NEXT: movq %rsi, %r13 -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: movq 40(%rdi), %r12 +; X64-NEXT: movq 32(%rdi), %r14 +; X64-NEXT: movq 56(%rdi), %r15 +; X64-NEXT: movq 48(%rdi), %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq (%rsi), %r11 +; X64-NEXT: movq 8(%rsi), %r8 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r10 -; 
X64-NEXT: addq %rbp, %r10 -; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r9, %rcx +; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %esi -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: adcq %rsi, %r8 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %al, %r9d ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r10, %rsi +; X64-NEXT: adcq %r9, %r13 +; X64-NEXT: movq %r14, %rax ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: addq %rdi, %rax +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbp, %rbx -; X64-NEXT: setb %sil -; X64-NEXT: movq %r15, %rbp -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rbx, %rcx -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %r12, %rcx -; X64-NEXT: adcq %r10, %r15 -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r13, %rdi -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 16(%r13), %r10 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %rbp, %r11 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r9, %r11 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: addq %r11, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r10, %r9 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r9, %rbx +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %r11 +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq %rcx, %r11 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: adcq $0, %r13 ; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r12, %rsi -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq 24(%rdi), %rdi -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq 16(%rbp), %r15 +; X64-NEXT: movq %r14, %r10 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r12, %rcx +; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rdi, %r14 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq 24(%rbp), %rdi +; X64-NEXT: 
movq %r10, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r12 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: addq %rsi, %rax -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: adcq %rbx, %rbp -; X64-NEXT: setb %r9b -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %rdi, %rbp +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: adcq %r9, %r12 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rbp, %rbx -; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %r12, %r9 +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %rsi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %rbx +; X64-NEXT: addq %rbx, %r8 +; X64-NEXT: movq %r8, (%rsp) # 8-byte Spill +; X64-NEXT: adcq %r11, %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %r9 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: addq %r14, %rbx -; X64-NEXT: adcq %r8, %rdi -; X64-NEXT: setb %r11b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: addq %rsi, %r9 +; X64-NEXT: adcq %r13, %rdi +; X64-NEXT: setb %r10b +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r14, %rbp +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rcx, %r11 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r11, %rax +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: movzbl %sil, %ecx ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %rbx, %r15 -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rdi, %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %r11b, %ecx +; X64-NEXT: addq %r9, %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rdi, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %r10b, %ecx ; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq 16(%rsi), %rcx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq 24(%rsi), %rbx -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 +; X64-NEXT: movq 16(%r8), %rsi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq 24(%r8), %r14 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %r13, %r15 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r9, %rbp +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rcx, %r11 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload ; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rbp, %r15 -; X64-NEXT: adcq %rdi, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r11, %rsi +; X64-NEXT: adcq %rdi, %rbx +; X64-NEXT: setb %r10b +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r13 +; X64-NEXT: movq %r13, %r12 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rbx, %rdi +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq (%r8), %r13 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rcx, %r12 -; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: movq 8(%r8), %rbp +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r11, %r14 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r12, %r11 +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rbx, %r12 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r12, %rbx +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: movq (%rsi), %rdi +; X64-NEXT: addq %r9, %rbx +; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %r13, %r10 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rsi, %r14 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: adcq %r9, %r12 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r12, %rsi +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %r9 +; X64-NEXT: addq %rbx, %r13 +; X64-NEXT: 
movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r11, %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: addq %rdi, %rsi +; X64-NEXT: adcq %rcx, %r9 +; X64-NEXT: setb %r10b +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 8(%rsi), %r9 -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbp, %rsi -; X64-NEXT: setb %bl -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rsi, %rcx -; X64-NEXT: movzbl %bl, %eax -; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: addq %r14, %rcx -; X64-NEXT: adcq %r15, %r13 -; X64-NEXT: adcq $0, %r12 ; X64-NEXT: adcq $0, %r11 -; X64-NEXT: movq %rdi, %rbp -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rsi, %rdi -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: adcq %rbx, %rbp -; X64-NEXT: setb %r15b -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rbp, %rbx -; X64-NEXT: movzbl %r15b, %eax -; X64-NEXT: adcq %rax, %r8 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r13, %rdi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: addq %r12, %rbx -; X64-NEXT: adcq %r11, %r8 -; X64-NEXT: setb %r14b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, %r15 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rbp, %r10 -; X64-NEXT: adcq %rdi, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %rbx, %r11 -; X64-NEXT: adcq %r8, %r10 -; X64-NEXT: movzbl %r14b, %ecx -; 
X64-NEXT: adcq %rcx, %rax +; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %rbx, %rcx +; X64-NEXT: adcq %r11, %r14 +; X64-NEXT: setb %r11b +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movzbl %r11b, %edi +; X64-NEXT: adcq %rdi, %rdx +; X64-NEXT: addq %rsi, %r13 +; X64-NEXT: adcq %r9, %rcx +; X64-NEXT: movzbl %r10b, %esi +; X64-NEXT: adcq %rsi, %rax ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq (%rsp), %rax # 8-byte Folded Reload ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -5086,281 +5087,285 @@ ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq 32(%rsi), %rdi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r9, %rbx -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq 32(%rcx), %rdi +; X64-NEXT: movq %r8, %r10 +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r11 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 40(%rsi), %rcx -; X64-NEXT: movq %rsi, %r13 -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rbp, %r9 -; X64-NEXT: adcq %rdi, %rcx -; X64-NEXT: setb %sil -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: adcq %rax, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r11, (%rsp) # 8-byte Spill -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rsi, %r11 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq 40(%rcx), %rsi +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: adcq $0, %rdi +; 
X64-NEXT: addq %r11, %rsi +; X64-NEXT: adcq %r9, %rbx +; X64-NEXT: setb %r10b ; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: addq %rsi, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rbx, %r11 +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rdi, %rbp -; X64-NEXT: setb %sil -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rbp, %rcx -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: adcq %rax, %r8 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq %r9, %r8 +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %rbx, %r13 ; X64-NEXT: adcq $0, %r14 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq 48(%r13), %r11 -; X64-NEXT: movq %r12, %rdi ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r12, %rbx -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq 56(%r13), %rsi +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: addq %r13, %rax +; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill +; X64-NEXT: adcq %r14, %r10 +; X64-NEXT: setb %r15b +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r10, %r14 +; X64-NEXT: movzbl %r15b, %eax +; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: addq %r8, %r14 +; X64-NEXT: adcq %rsi, %rbx +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: movq 48(%rcx), %rcx +; X64-NEXT: movq %r12, %r15 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rbp ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rbx, %r13 -; X64-NEXT: adcq %rbp, %rdi -; X64-NEXT: setb %bl +; X64-NEXT: addq %rsi, %r13 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq 56(%r8), %rsi ; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r15 +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r13, %rdi +; X64-NEXT: adcq %r10, %r15 +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rdi, %r12 -; X64-NEXT: movzbl %bl, %eax +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r15, %r13 +; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %rsi -; X64-NEXT: addq %rcx, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r8, %r13 -; X64-NEXT: adcq $0, %r12 +; 
X64-NEXT: addq %r14, %rbp +; X64-NEXT: movq %rbp, %r8 +; X64-NEXT: adcq %rbx, %rdi +; X64-NEXT: adcq $0, %r13 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: addq %r14, %r12 -; X64-NEXT: adcq %r10, %rsi -; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r11 +; X64-NEXT: addq %r11, %r13 +; X64-NEXT: adcq %r9, %rsi +; X64-NEXT: setb %bpl +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r9, %rbp -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rbp, %r9 -; X64-NEXT: adcq %rdi, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %r10 -; X64-NEXT: addq %r12, %r8 -; X64-NEXT: adcq %rsi, %r9 -; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r9, %rbx ; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rbx, %r9 +; X64-NEXT: adcq %r10, %r15 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r15, %rbx +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: addq %r13, %r12 +; X64-NEXT: adcq %rsi, %r9 +; X64-NEXT: movzbl %bpl, %eax +; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %r8 +; X64-NEXT: adcq %rax, (%rsp) # 8-byte Folded Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %r12 ; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %rbx ; X64-NEXT: adcq $0, %r14 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; X64-NEXT: movq %r8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq (%rsp), %rsi # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: movq %rsi, %r13 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rcx, %r10 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rbx, %r12 -; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: setb %bl -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r9 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: movzbl %bl, %eax -; X64-NEXT: adcq %rax, %r11 +; X64-NEXT: addq %r10, %r12 +; X64-NEXT: adcq %rsi, %r15 +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r15, %rsi +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r10, %r13 +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, %r11 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq $0, %rbx +; X64-NEXT: adcq %r15, %r10 +; X64-NEXT: setb %r8b ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %rbp, %r8 -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq 
%rbx, %rbp -; X64-NEXT: setb %bl -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rbp, %rcx -; X64-NEXT: movzbl %bl, %eax -; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq %r12, %r15 -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: adcq $0, %r11 -; X64-NEXT: movq %r8, %rbx +; X64-NEXT: movq %rbp, %r15 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r10, %rbp +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %r13 +; X64-NEXT: addq %rdi, %rbp +; X64-NEXT: adcq %r12, %r13 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r11, %r8 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, %r12 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq %r8, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, %r9 -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r8, %rsi -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rsi, %r8 -; X64-NEXT: adcq %rbp, %rbx -; X64-NEXT: setb %sil -; X64-NEXT: movq %r9, %rax +; X64-NEXT: addq %r15, %rax +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rbx, %rbp -; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %r8, %r12 +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: addq %rcx, %r12 -; X64-NEXT: adcq %r15, %r8 -; X64-NEXT: adcq $0, %rbp +; X64-NEXT: addq %rbp, %r11 +; X64-NEXT: adcq %r13, %r15 +; X64-NEXT: movq %r15, %rbp +; X64-NEXT: adcq $0, %r12 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: addq %r13, %rbp -; X64-NEXT: adcq %r11, %rdi +; X64-NEXT: addq %rsi, %r12 +; X64-NEXT: adcq %r9, %rdi ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rcx, %rsi +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq %r13, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r15, %rbx +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rcx, %r8 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: addq %rbx, 
%rax -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: adcq %rsi, %r11 -; X64-NEXT: setb %bl -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: addq %r11, %rax -; X64-NEXT: movzbl %bl, %ecx +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %r8, %rax +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: setb %sil +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movzbl %sil, %ecx ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %rbp, %r13 -; X64-NEXT: adcq %rdi, %r15 +; X64-NEXT: addq %r12, %r10 +; X64-NEXT: adcq %rdi, %r8 ; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, %rcx @@ -5369,555 +5374,559 @@ ; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: adcq %r14, %r12 -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r10, %r8 -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rbx, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r14, %rbp +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rax, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %r8 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq 64(%rsi), %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: movq 64(%r9), %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rcx, %r15 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rsi, %r8 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 72(%rsi), %rcx -; X64-NEXT: movq %rsi, %r13 -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %rsi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rbp, %r14 -; X64-NEXT: adcq %rdi, %rcx -; X64-NEXT: setb %bl -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq 72(%r9), %rsi +; X64-NEXT: movq %r9, %rcx +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rcx, %r15 -; X64-NEXT: movzbl %bl, %eax -; 
X64-NEXT: adcq %rax, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %r8, %rbx +; X64-NEXT: adcq %rdi, %r10 +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %r10, %r9 +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rdi, %rsi -; X64-NEXT: setb %cl -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rsi, %rdi -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %r11, %rdi -; X64-NEXT: adcq %r14, %r12 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq 80(%r13), %r11 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %rbp, %rsi -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq 88(%r13), %r13 -; X64-NEXT: movq %r9, %rax +; X64-NEXT: addq %r8, %r14 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r12, %rdi ; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: adcq %rbx, %rcx -; X64-NEXT: setb %r9b -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, %r12 ; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: movzbl %r9b, %eax -; X64-NEXT: adcq %rax, %rbx -; X64-NEXT: addq %rdi, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r12, %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: addq %r15, %rsi -; X64-NEXT: adcq %r10, %rbx -; X64-NEXT: setb %r9b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq 
%rdx, %rdi ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r14, %rbp -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: adcq %rdi, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, %r14 -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rcx, %r12 -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %rdx -; X64-NEXT: addq %rsi, %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbx, %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: addq %r8, %rbp +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: addq %r11, %rbp +; X64-NEXT: adcq %rbx, %r14 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: movq 80(%rcx), %r15 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r8, %r11 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq 88(%rbx), %rbx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: addq %r11, %rax +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r8, %r13 +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: adcq $0, %rdx -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq (%rsp), %rax # 8-byte Reload -; X64-NEXT: imulq %rax, %r13 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r13, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: imulq %rdi, %r11 -; X64-NEXT: addq %rdx, %r11 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: imulq %rbx, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: imulq %rbp, %r8 -; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: addq %r10, %r9 -; X64-NEXT: adcq %r11, %r8 -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: addq %rbp, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r14, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: adcq $0, %r12 +; X64-NEXT: addq %r9, %r13 +; X64-NEXT: adcq %rsi, %r12 +; X64-NEXT: setb %bpl +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %rbx, %r11 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rbp 
+; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rbx, %r10 -; X64-NEXT: adcq %rsi, %rbp -; X64-NEXT: setb %cl +; X64-NEXT: addq %rdi, %r10 +; X64-NEXT: adcq $0, %r8 ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rbp, %r11 -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: addq %r9, %r11 -; X64-NEXT: adcq %r8, %r13 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq 120(%rdx), %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq 112(%rdx), %rsi +; X64-NEXT: movq %r11, %r14 +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: imulq %r8, %rsi -; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: movq 96(%rdi), %rcx -; X64-NEXT: movq 104(%rdi), %rbp -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: imulq %rbp, %rdi -; X64-NEXT: mulq %rcx +; X64-NEXT: addq %r10, %rax ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: imulq %rcx, %r14 -; X64-NEXT: addq %rdx, %r14 -; X64-NEXT: addq %r15, %r9 -; X64-NEXT: adcq %rsi, %r14 -; X64-NEXT: movq %r14, %r15 +; X64-NEXT: adcq %r8, %rdi +; X64-NEXT: setb %r8b ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rsi, %rdi -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rdi, %rsi -; X64-NEXT: adcq %rbx, %rcx -; X64-NEXT: setb %bl -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %bl, %ecx +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: movzbl %r8b, %ecx ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r9, %rax -; X64-NEXT: adcq %r15, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: adcq %r10, %rsi -; X64-NEXT: adcq %r11, %rax -; X64-NEXT: adcq %r13, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: addq %r13, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r12, %rax +; X64-NEXT: adcq %r12, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %bpl, %ecx +; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq 80(%r9), %rbp -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 88(%r9), %rbx -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r14 -; 
X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: imulq %rax, %rbx +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rbx, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: imulq %rcx, %r15 +; X64-NEXT: addq %rdx, %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rdi, %r10 -; X64-NEXT: adcq %rsi, %rbp -; X64-NEXT: setb %sil -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: imulq %rsi, %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r10, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: imulq %r9, %rbx +; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: addq %r8, %rdi +; X64-NEXT: adcq %r15, %rbx +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r8, %r15 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %rbx ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rbp, %r11 -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: adcq %rax, %r8 -; X64-NEXT: movq 64(%r9), %r13 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq %r14, %rdi -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq 72(%r9), %rax -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r14, %rbp -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %r15, %r11 +; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill -; X64-NEXT: adcq %rdi, %rsi -; X64-NEXT: setb %cl -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rsi, %r12 -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r14 -; X64-NEXT: addq %r15, %r12 -; X64-NEXT: adcq %r10, %r14 -; X64-NEXT: adcq $0, %r11 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r8, %r15 +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %rsi +; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: adcq %rbx, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; X64-NEXT: movq 120(%rdx), %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: imulq %rax, %rdi +; X64-NEXT: movq 112(%rdx), %rbx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rax, %rbp ; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: imulq %rcx, %rbx +; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: movq 96(%r12), %r10 
+; X64-NEXT: movq 104(%r12), %rdi +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r14, %r12 +; X64-NEXT: imulq %rdi, %r12 +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r12, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: imulq %r10, %r14 +; X64-NEXT: addq %rdx, %r14 +; X64-NEXT: addq %r8, %r13 +; X64-NEXT: adcq %rbx, %r14 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r9, %r15 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %r8, %r12 +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r12, %rbx +; X64-NEXT: adcq %rbp, %r10 +; X64-NEXT: setb %r8b +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: addq %r10, %rax +; X64-NEXT: movzbl %r8b, %edi +; X64-NEXT: adcq %rdi, %rdx +; X64-NEXT: addq %r13, %rax +; X64-NEXT: adcq %r14, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: adcq %r11, %rbx +; X64-NEXT: adcq %r15, %rax +; X64-NEXT: adcq %rsi, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rbx +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq 80(%r13), %r8 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq 88(%r13), %r11 +; X64-NEXT: movq %r13, %r10 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rsi, %r9 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rbp, %rcx +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rdi, %r14 +; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: setb %dil +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r8, %r11 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %rcx, %rsi +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: adcq %rax, %r13 +; X64-NEXT: movq %r10, %rdi +; X64-NEXT: movq 64(%r10), %r10 +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: adcq %rsi, %rdi -; X64-NEXT: setb %cl -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq 72(%rdi), %rax +; X64-NEXT: 
movq %rax, %r8 ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rdi, %r13 -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %r12, %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r14, %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: addq %r11, %r13 -; X64-NEXT: adcq %r8, %r15 -; X64-NEXT: setb %r8b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: addq %rcx, %r12 +; X64-NEXT: adcq $0, %r15 ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %r11, %rcx -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %r11, %r9 +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %r12, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r15, %rcx +; X64-NEXT: setb %dil +; X64-NEXT: movq %r8, %r11 +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: adcq %rsi, %r14 -; X64-NEXT: setb %sil +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: adcq %rax, %r12 +; X64-NEXT: addq %rbx, %rbp +; X64-NEXT: adcq %r14, %r12 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: movq %r10, %rdi +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %rbx +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rcx, %r14 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %r14, %rax -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: addq %r13, %r12 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: adcq %r10, %rcx +; X64-NEXT: setb %dil +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rcx, %r14 +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: adcq %rax, %r10 +; X64-NEXT: addq %rbp, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r12, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %r14 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: addq %rsi, %r14 +; X64-NEXT: adcq %r13, %r10 +; X64-NEXT: setb %dil +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r15 +; 
X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %r9, %rax +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: setb %sil +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movzbl %sil, %ecx +; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: addq %r14, %r12 ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %r8b, %ecx +; X64-NEXT: adcq %r10, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %dil, %ecx ; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq 96(%rbp), %rsi -; X64-NEXT: imulq %rsi, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq 96(%rdi), %rsi +; X64-NEXT: imulq %rsi, %r15 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r9, %rdx -; X64-NEXT: movq 104(%rbp), %r15 -; X64-NEXT: imulq %r15, %rbx -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: movq %rbx, %r9 -; X64-NEXT: movq 112(%rbp), %rax -; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movq %r8, %rcx +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r15, %rdx +; X64-NEXT: movq 104(%rdi), %r9 +; X64-NEXT: imulq %r9, %rcx +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq 112(%rdi), %rax ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: imulq %rbx, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: imulq %r12, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: movq 120(%rdi), %r8 -; X64-NEXT: imulq %rbp, %r8 -; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: addq %r14, %r13 -; X64-NEXT: adcq %r9, %r8 -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq 120(%rdi), %rdi +; X64-NEXT: imulq %r15, %rdi +; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: addq %r10, %r8 +; X64-NEXT: adcq %r14, %rdi +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %r14, %rcx -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rsi, %rdi -; X64-NEXT: setb %cl -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rdi, %r15 -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %r13, %r15 -; X64-NEXT: adcq %r8, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: imulq %r11, %rcx -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r10, %r13 +; X64-NEXT: adcq $0, %r14 
+; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r13, %rbp +; X64-NEXT: adcq %r14, %rcx +; X64-NEXT: setb %sil +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rcx, %r14 +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: adcq %rax, %r10 +; X64-NEXT: addq %r8, %r14 +; X64-NEXT: adcq %rdi, %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: imulq %r15, %rdi +; X64-NEXT: movq %r15, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: imulq %r14, %rsi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: imulq %r12, %rsi ; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: movq %rsi, %r9 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: imulq %rbp, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: imulq %r9, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: imulq %rdi, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: addq %r8, %r13 -; X64-NEXT: adcq %r9, %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %r10 -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rsi, %rdi -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: imulq %r11, %rbx +; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: addq %rcx, %r13 +; X64-NEXT: adcq %r8, %rbx +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rdi, %rsi -; X64-NEXT: adcq %r9, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: addq %r8, %rsi +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rsi, %r11 +; X64-NEXT: adcq %rdi, %r8 +; X64-NEXT: setb %sil +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: addq %r8, %rax +; X64-NEXT: movzbl %sil, %esi +; X64-NEXT: adcq %rsi, %rdx ; X64-NEXT: addq %r13, %rax -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; X64-NEXT: adcq %r15, %rax -; X64-NEXT: adcq %r12, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; 
X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: adcq %rbx, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: adcq %rbp, %r11 +; X64-NEXT: adcq %r14, %rax +; X64-NEXT: adcq %r10, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: movq (%rsp), %rdi # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: movq %rcx, %r9 +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: movq %rsi, %r8 ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload -; X64-NEXT: movq %rdi, %r10 +; X64-NEXT: movq %rdi, %r9 +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, (%rcx) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 8(%rcx) +; X64-NEXT: movq %rdi, (%rsi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 16(%rcx) +; X64-NEXT: movq %rdi, 8(%rsi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 24(%rcx) +; X64-NEXT: movq %rdi, 16(%rsi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 32(%rcx) +; X64-NEXT: movq %rdi, 24(%rsi) ; 
X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 40(%rcx) +; X64-NEXT: movq %rdi, 32(%rsi) +; X64-NEXT: movq (%rsp), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 40(%rsi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 48(%rcx) +; X64-NEXT: movq %rdi, 48(%rsi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 56(%rcx) -; X64-NEXT: movq %r9, 64(%rcx) -; X64-NEXT: movq %r10, 72(%rcx) -; X64-NEXT: movq %rbx, 80(%rcx) -; X64-NEXT: movq %rbp, 88(%rcx) -; X64-NEXT: movq %r8, 96(%rcx) -; X64-NEXT: movq %rsi, 104(%rcx) -; X64-NEXT: movq %rax, 112(%rcx) -; X64-NEXT: movq %rdx, 120(%rcx) +; X64-NEXT: movq %rdi, 56(%rsi) +; X64-NEXT: movq %r8, 64(%rsi) +; X64-NEXT: movq %r9, 72(%rsi) +; X64-NEXT: movq %r10, 80(%rsi) +; X64-NEXT: movq %rbx, 88(%rsi) +; X64-NEXT: movq %rcx, 96(%rsi) +; X64-NEXT: movq %r11, 104(%rsi) +; X64-NEXT: movq %rax, 112(%rsi) +; X64-NEXT: movq %rdx, 120(%rsi) ; X64-NEXT: addq $240, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll --- a/llvm/test/CodeGen/X86/mul-i256.ll +++ b/llvm/test/CodeGen/X86/mul-i256.ll @@ -312,71 +312,66 @@ ; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: pushq %r14 ; X64-NEXT: .cfi_def_cfa_offset 24 -; X64-NEXT: pushq %r12 -; X64-NEXT: .cfi_def_cfa_offset 32 ; X64-NEXT: pushq %rbx -; X64-NEXT: .cfi_def_cfa_offset 40 -; X64-NEXT: .cfi_offset %rbx, -40 -; X64-NEXT: .cfi_offset %r12, -32 +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: .cfi_offset %rbx, -32 ; X64-NEXT: .cfi_offset %r14, -24 ; X64-NEXT: .cfi_offset %r15, -16 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq (%rdi), %r14 -; X64-NEXT: movq 8(%rdi), %r8 -; X64-NEXT: movq 16(%rdi), %rcx -; X64-NEXT: movq 16(%rsi), %rbx -; X64-NEXT: movq (%rsi), %r12 -; X64-NEXT: movq 8(%rsi), %r15 -; X64-NEXT: movq 24(%rdi), %rdi -; X64-NEXT: imulq %r12, %rdi -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: imulq %r15, %rcx -; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: movq %rbx, %rdi -; X64-NEXT: imulq %r8, %rdi +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq (%rdi), %rbx +; X64-NEXT: movq 8(%rdi), %r11 +; X64-NEXT: movq 16(%rdi), %r10 +; X64-NEXT: movq 16(%rsi), %r8 +; X64-NEXT: movq (%rsi), %r9 +; X64-NEXT: movq 8(%rsi), %r14 +; X64-NEXT: movq 24(%rdi), %r15 +; X64-NEXT: imulq %r9, %r15 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r15, %rdx +; X64-NEXT: imulq %r14, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: imulq %r11, %r15 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r15, %rdx +; X64-NEXT: movq 24(%rsi), %r15 +; X64-NEXT: imulq %rbx, %r15 +; X64-NEXT: addq %rdx, %r15 +; X64-NEXT: addq %rdi, %r8 +; X64-NEXT: adcq %r10, %r15 ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq 24(%rsi), %rbx -; X64-NEXT: imulq %r14, %rbx -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: addq %r10, %r11 -; X64-NEXT: adcq %rcx, %rbx -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rsi, %rdi -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r15 
+; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rsi, %r9 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rdi, %r14 -; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r9, %rbx +; X64-NEXT: adcq %rdi, %rsi ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %ecx -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: movzbl %al, %edi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: addq %rsi, %rax -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r11, %rax -; X64-NEXT: adcq %rbx, %rdx -; X64-NEXT: movq %r10, (%r9) -; X64-NEXT: movq %r14, 8(%r9) -; X64-NEXT: movq %rax, 16(%r9) -; X64-NEXT: movq %rdx, 24(%r9) +; X64-NEXT: adcq %rdi, %rdx +; X64-NEXT: addq %r8, %rax +; X64-NEXT: adcq %r15, %rdx +; X64-NEXT: movq %r10, (%rcx) +; X64-NEXT: movq %rbx, 8(%rcx) +; X64-NEXT: movq %rax, 16(%rcx) +; X64-NEXT: movq %rdx, 24(%rcx) ; X64-NEXT: popq %rbx -; X64-NEXT: .cfi_def_cfa_offset 32 -; X64-NEXT: popq %r12 ; X64-NEXT: .cfi_def_cfa_offset 24 ; X64-NEXT: popq %r14 ; X64-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll --- a/llvm/test/CodeGen/X86/mul-i512.ll +++ b/llvm/test/CodeGen/X86/mul-i512.ll @@ -1179,267 +1179,271 @@ ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: pushq %rax +; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq (%rdi), %r9 -; X64-NEXT: movq 8(%rdi), %r8 -; X64-NEXT: movq 24(%rdi), %r15 -; X64-NEXT: movq 16(%rdi), %rax -; X64-NEXT: movq (%rsi), %rdi -; X64-NEXT: movq 8(%rsi), %r14 -; X64-NEXT: movq %rsi, %r12 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: movq (%rdi), %rbx +; X64-NEXT: movq 8(%rdi), %r9 +; X64-NEXT: movq 24(%rdi), %r12 +; X64-NEXT: movq 16(%rdi), %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: movq 8(%rsi), %r11 +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rcx, %rsi +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rsi, %r15 +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rbx, %r10 -; X64-NEXT: adcq %rbp, %rcx +; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: adcq $0, %r8 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r10, %rcx +; X64-NEXT: adcq %r8, %r14 ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %esi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %r14, %r15 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; 
X64-NEXT: adcq %rsi, %rdx +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r14, %r10 +; X64-NEXT: adcq %rsi, %r13 +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: adcq $0, %rbx ; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r9, %rdi ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %rbp, %rax +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r14, %r15 +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rbx, %r12 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: addq %r15, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbx, %rcx +; X64-NEXT: adcq %rbp, %rbx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rbx, %rbp ; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %r11, %rbx -; X64-NEXT: adcq %r10, %r15 -; X64-NEXT: adcq $0, %r14 +; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: adcq %rcx, %r14 +; X64-NEXT: adcq $0, %r10 ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 16(%r12), %r9 -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdi, %rsi ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, %r11 -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r9 +; X64-NEXT: movq 16(%rdi), %r8 +; X64-NEXT: movq %r12, %r11 +; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r10, %rbp -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq 24(%r12), %r8 -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, %r12 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: adcq %rcx, %rsi -; X64-NEXT: setb %cl +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %rcx, %r15 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq 24(%rsi), %rsi ; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r15, %r11 +; X64-NEXT: adcq %rbx, %r9 +; X64-NEXT: setb %bl +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r9, %rcx +; X64-NEXT: 
movzbl %bl, %eax +; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: addq %rbp, %rdi +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r14, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: addq %r10, %rcx +; X64-NEXT: adcq %r13, %r15 +; X64-NEXT: setb %r12b +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rsi, %r12 -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: addq %rbx, %r13 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %r12 -; X64-NEXT: adcq $0, %r11 -; X64-NEXT: addq %r14, %r12 -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; X64-NEXT: setb %r15b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: setb %bl +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: adcq %r9, %rbp +; X64-NEXT: setb %dil ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: movzbl %bl, %eax -; X64-NEXT: adcq %rax, %rdx -; X64-NEXT: addq %r12, %rdi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r11, %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %r15b, %eax -; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: mulq %rsi +; X64-NEXT: addq %rbp, %rax +; X64-NEXT: movzbl %dil, %edi +; X64-NEXT: adcq %rdi, %rdx +; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r15, %rbx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %r12b, %ecx +; X64-NEXT: adcq %rcx, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq 32(%rcx), %r12 -; X64-NEXT: imulq %r12, %r8 -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r8, %rdx -; X64-NEXT: movq 40(%rcx), %r8 -; X64-NEXT: imulq %r8, %r9 -; X64-NEXT: addq %rdx, %r9 +; X64-NEXT: movq 32(%rcx), %r15 +; X64-NEXT: imulq %r15, %rsi +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rsi, %rdx +; X64-NEXT: movq 40(%rcx), %rsi +; X64-NEXT: imulq %rsi, %r8 +; X64-NEXT: addq %rdx, %r8 ; X64-NEXT: movq 48(%rcx), %rax -; X64-NEXT: movq %rcx, %rbp +; X64-NEXT: movq %rcx, %r11 ; X64-NEXT: 
movq %rax, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: imulq %r15, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: imulq %r14, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq 56(%rbp), %rbp -; X64-NEXT: imulq %rbx, %rbp -; X64-NEXT: addq %rdx, %rbp -; X64-NEXT: addq %r11, %rcx -; X64-NEXT: adcq %r9, %rbp +; X64-NEXT: movq 56(%r11), %r11 +; X64-NEXT: imulq %rbx, %r11 +; X64-NEXT: addq %rdx, %r11 +; X64-NEXT: addq %r9, %rcx +; X64-NEXT: adcq %r8, %r11 ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %rbx, %r11 -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rbx, %r8 +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r9, %rbx -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rbx, %r12 -; X64-NEXT: adcq %rdi, %rsi -; X64-NEXT: setb %bl -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rsi, %r13 -; X64-NEXT: movzbl %bl, %eax -; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: adcq %rbp, %r15 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq 56(%rdx), %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq 48(%rdx), %rbx -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: imulq %r9, %rbx -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: movq 32(%r8), %rdi -; X64-NEXT: movq 40(%r8), %r8 +; X64-NEXT: addq %rbx, %r13 +; X64-NEXT: adcq %r9, %r15 +; X64-NEXT: setb %dil +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r15, %r8 +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: adcq %rax, %r12 +; X64-NEXT: addq %rcx, %r8 +; X64-NEXT: adcq %r11, %r12 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq 56(%rcx), %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: imulq %rax, %rsi +; X64-NEXT: movq 48(%rcx), %r11 +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: imulq %r8, %rcx -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: imulq %rdi, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: addq %rsi, %r11 -; X64-NEXT: adcq %rbx, %r10 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: addq %rsi, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: imulq %r14, %r11 +; X64-NEXT: addq %rdx, %r11 +; X64-NEXT: movq 32(%rdi), %r9 +; X64-NEXT: movq 40(%rdi), %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movq %rax, 
%rsi -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rbx, %rbp -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: imulq %r15, %rsi ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rbp, %rdi -; X64-NEXT: adcq %rcx, %rbx -; X64-NEXT: setb %cl -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r11, %rax +; X64-NEXT: addq %rsi, %rdx +; X64-NEXT: imulq %r9, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq %r11, %r10 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rsi, %rbx +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rbx, %r9 +; X64-NEXT: adcq %rbp, %rsi +; X64-NEXT: setb %bl +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: addq %rsi, %rax +; X64-NEXT: movzbl %bl, %esi +; X64-NEXT: adcq %rsi, %rdx +; X64-NEXT: addq %rdi, %rax ; X64-NEXT: adcq %r10, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; X64-NEXT: adcq %r12, %rdi -; X64-NEXT: adcq %r13, %rax -; X64-NEXT: adcq %r15, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload -; X64-NEXT: adcq %r14, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: adcq %r13, %r9 +; X64-NEXT: adcq %r8, %rax +; X64-NEXT: adcq %r12, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, (%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, 8(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, 16(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, 24(%rcx) -; X64-NEXT: movq %rsi, 32(%rcx) -; X64-NEXT: movq %rdi, 40(%rcx) -; X64-NEXT: movq %rax, 48(%rcx) -; X64-NEXT: movq %rdx, 56(%rcx) +; X64-NEXT: movq (%rsp), %rsi # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, (%rsi) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 8(%rsi) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 16(%rsi) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 24(%rsi) +; X64-NEXT: movq %rcx, 32(%rsi) +; X64-NEXT: movq %r9, 40(%rsi) +; X64-NEXT: movq %rax, 48(%rsi) +; X64-NEXT: movq %rdx, 56(%rsi) +; X64-NEXT: addq $8, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll --- a/llvm/test/CodeGen/X86/muloti.ll +++ b/llvm/test/CodeGen/X86/muloti.ll @@ -7,64 +7,61 @@ 
define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp { ; CHECK-LABEL: x: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset %rbx, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 ; CHECK-NEXT: movq %rdx, %r11 -; CHECK-NEXT: movq %rsi, %r9 -; CHECK-NEXT: movq %rdi, %r15 -; CHECK-NEXT: sarq $63, %rsi +; CHECK-NEXT: movq %rdi, %r9 +; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: sarq $63, %rbx ; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: imulq %rsi, %rdi +; CHECK-NEXT: imulq %rbx, %rdi ; CHECK-NEXT: movq %rdx, %rax -; CHECK-NEXT: mulq %rsi +; CHECK-NEXT: mulq %rbx ; CHECK-NEXT: movq %rax, %r8 ; CHECK-NEXT: addq %rdi, %rdx -; CHECK-NEXT: imulq %rcx, %rsi -; CHECK-NEXT: addq %rdx, %rsi +; CHECK-NEXT: imulq %rcx, %rbx +; CHECK-NEXT: addq %rdx, %rbx ; CHECK-NEXT: movq %rcx, %rdi ; CHECK-NEXT: sarq $63, %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: imulq %r9, %rbx +; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: imulq %rsi, %r14 ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: mulq %r15 +; CHECK-NEXT: mulq %r9 ; CHECK-NEXT: movq %rax, %r10 -; CHECK-NEXT: addq %rbx, %rdx -; CHECK-NEXT: imulq %r15, %rdi +; CHECK-NEXT: addq %r14, %rdx +; CHECK-NEXT: imulq %r9, %rdi ; CHECK-NEXT: addq %rdx, %rdi ; CHECK-NEXT: addq %r8, %r10 -; CHECK-NEXT: adcq %rsi, %rdi -; CHECK-NEXT: movq %r15, %rax -; CHECK-NEXT: mulq %r11 -; CHECK-NEXT: movq %rdx, %r14 -; CHECK-NEXT: movq %rax, %r8 +; CHECK-NEXT: adcq %rbx, %rdi ; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: mulq %r11 ; CHECK-NEXT: movq %rdx, %rbx -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: addq %r14, %rsi -; CHECK-NEXT: adcq $0, %rbx -; CHECK-NEXT: movq %r15, %rax +; CHECK-NEXT: movq %rax, %r8 +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: mulq %r11 +; CHECK-NEXT: movq %rdx, %r11 +; CHECK-NEXT: movq %rax, %r14 +; CHECK-NEXT: addq %rbx, %r14 +; CHECK-NEXT: adcq $0, %r11 +; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: mulq %rcx -; CHECK-NEXT: movq %rdx, %r14 -; CHECK-NEXT: movq %rax, %r11 -; CHECK-NEXT: addq %rsi, %r11 -; CHECK-NEXT: adcq %rbx, %r14 +; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rax, %r9 +; CHECK-NEXT: addq %r14, %r9 +; CHECK-NEXT: adcq %r11, %rbx ; CHECK-NEXT: setb %al -; CHECK-NEXT: movzbl %al, %esi -; CHECK-NEXT: movq %r9, %rax +; CHECK-NEXT: movzbl %al, %r11d +; CHECK-NEXT: movq %rsi, %rax ; CHECK-NEXT: mulq %rcx -; CHECK-NEXT: addq %r14, %rax -; CHECK-NEXT: adcq %rsi, %rdx +; CHECK-NEXT: addq %rbx, %rax +; CHECK-NEXT: adcq %r11, %rdx ; CHECK-NEXT: addq %r10, %rax ; CHECK-NEXT: adcq %rdi, %rdx -; CHECK-NEXT: movq %r11, %rcx +; CHECK-NEXT: movq %r9, %rcx ; CHECK-NEXT: sarq $63, %rcx ; CHECK-NEXT: xorq %rcx, %rdx ; CHECK-NEXT: xorq %rax, %rcx @@ -72,10 +69,9 @@ ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: ## %bb.2: ## %nooverflow ; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: movq %r11, %rdx +; CHECK-NEXT: movq %r9, %rdx ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 ; CHECK-NEXT: retq ; CHECK-NEXT: LBB0_1: ## %overflow ; CHECK-NEXT: ud2 diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll --- a/llvm/test/CodeGen/X86/musttail-varargs.ll +++ 
b/llvm/test/CodeGen/X86/musttail-varargs.ll @@ -46,12 +46,12 @@ ; LINUX-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUX-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUX-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; LINUX-NEXT: movq %r9, %r15 -; LINUX-NEXT: movq %r8, %r12 -; LINUX-NEXT: movq %rcx, %r13 -; LINUX-NEXT: movq %rdx, %rbp -; LINUX-NEXT: movq %rsi, %rbx -; LINUX-NEXT: movq %rdi, %r14 +; LINUX-NEXT: movq %r9, %r14 +; LINUX-NEXT: movq %r8, %r15 +; LINUX-NEXT: movq %rcx, %r12 +; LINUX-NEXT: movq %rdx, %r13 +; LINUX-NEXT: movq %rsi, %rbp +; LINUX-NEXT: movq %rdi, %rbx ; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) @@ -77,13 +77,13 @@ ; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; LINUX-NEXT: callq get_f@PLT ; LINUX-NEXT: movq %rax, %r11 -; LINUX-NEXT: movq %r14, %rdi -; LINUX-NEXT: movq %rbx, %rsi -; LINUX-NEXT: movq %rbp, %rdx -; LINUX-NEXT: movq %r13, %rcx -; LINUX-NEXT: movq %r12, %r8 +; LINUX-NEXT: movq %rbx, %rdi +; LINUX-NEXT: movq %rbp, %rsi +; LINUX-NEXT: movq %r13, %rdx +; LINUX-NEXT: movq %r12, %rcx +; LINUX-NEXT: movq %r15, %r8 ; LINUX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; LINUX-NEXT: movq %r15, %r9 +; LINUX-NEXT: movq %r14, %r9 ; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -139,12 +139,12 @@ ; LINUX-X32-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; LINUX-X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; LINUX-X32-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; LINUX-X32-NEXT: movq %r9, %r15 -; LINUX-X32-NEXT: movq %r8, %r12 -; LINUX-X32-NEXT: movq %rcx, %r13 -; LINUX-X32-NEXT: movq %rdx, %rbp -; LINUX-X32-NEXT: movq %rsi, %rbx -; LINUX-X32-NEXT: movq %rdi, %r14 +; LINUX-X32-NEXT: movq %r9, %r14 +; LINUX-X32-NEXT: movq %r8, %r15 +; LINUX-X32-NEXT: movq %rcx, %r12 +; LINUX-X32-NEXT: movq %rdx, %r13 +; LINUX-X32-NEXT: movq %rsi, %rbp +; LINUX-X32-NEXT: movq %rdi, %rbx ; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movq %rcx, {{[0-9]+}}(%esp) @@ -170,13 +170,13 @@ ; LINUX-X32-NEXT: movq %rax, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: callq get_f@PLT ; LINUX-X32-NEXT: movl %eax, %r11d -; LINUX-X32-NEXT: movq %r14, %rdi -; LINUX-X32-NEXT: movq %rbx, %rsi -; LINUX-X32-NEXT: movq %rbp, %rdx -; LINUX-X32-NEXT: movq %r13, %rcx -; LINUX-X32-NEXT: movq %r12, %r8 +; LINUX-X32-NEXT: movq %rbx, %rdi +; LINUX-X32-NEXT: movq %rbp, %rsi +; LINUX-X32-NEXT: movq %r13, %rdx +; LINUX-X32-NEXT: movq %r12, %rcx +; LINUX-X32-NEXT: movq %r15, %r8 ; LINUX-X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; LINUX-X32-NEXT: movq %r15, %r9 +; LINUX-X32-NEXT: movq %r14, %r9 ; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload ; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload @@ -214,20 +214,20 @@ ; WINDOWS-NEXT: subq $72, %rsp ; WINDOWS-NEXT: .seh_stackalloc 72 ; WINDOWS-NEXT: .seh_endprologue -; WINDOWS-NEXT: movq %r9, %r14 +; WINDOWS-NEXT: movq %r9, %rsi ; WINDOWS-NEXT: movq %r8, %rdi ; WINDOWS-NEXT: movq %rdx, %rbx -; WINDOWS-NEXT: movq %rcx, %rsi +; WINDOWS-NEXT: movq %rcx, %r14 ; WINDOWS-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: 
movq %r8, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: movq %r9, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; WINDOWS-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: callq get_f -; WINDOWS-NEXT: movq %rsi, %rcx +; WINDOWS-NEXT: movq %r14, %rcx ; WINDOWS-NEXT: movq %rbx, %rdx ; WINDOWS-NEXT: movq %rdi, %r8 -; WINDOWS-NEXT: movq %r14, %r9 +; WINDOWS-NEXT: movq %rsi, %r9 ; WINDOWS-NEXT: addq $72, %rsp ; WINDOWS-NEXT: popq %rbx ; WINDOWS-NEXT: popq %rdi diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll --- a/llvm/test/CodeGen/X86/nontemporal-loads.ll +++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll @@ -1781,29 +1781,26 @@ ; ; SSE41-LABEL: test_masked_v16i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm7, %xmm9 -; SSE41-NEXT: movdqa %xmm6, %xmm10 -; SSE41-NEXT: movdqa %xmm5, %xmm11 ; SSE41-NEXT: movdqa %xmm0, %xmm8 ; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: movntdqa 48(%rdi), %xmm4 -; SSE41-NEXT: movntdqa 32(%rdi), %xmm7 -; SSE41-NEXT: movntdqa 16(%rdi), %xmm6 -; SSE41-NEXT: movntdqa (%rdi), %xmm5 -; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm11, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm7 -; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: movntdqa 32(%rdi), %xmm9 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm10 +; SSE41-NEXT: movntdqa (%rdi), %xmm11 +; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm11 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm10 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm4 -; SSE41-NEXT: movaps %xmm5, %xmm0 -; SSE41-NEXT: movaps %xmm6, %xmm1 -; SSE41-NEXT: movaps %xmm7, %xmm2 +; SSE41-NEXT: movaps %xmm11, %xmm0 +; SSE41-NEXT: movaps %xmm10, %xmm1 +; SSE41-NEXT: movaps %xmm9, %xmm2 ; SSE41-NEXT: movaps %xmm4, %xmm3 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -910,7 +910,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqu (%rdi), %xmm3 ; SSE2-NEXT: movdqu 16(%rdi), %xmm2 -; SSE2-NEXT: movdqu 32(%rdi), %xmm8 +; SSE2-NEXT: movdqu 32(%rdi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm1, %xmm4 @@ -921,7 +921,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1] ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] @@ -938,12 +938,12 @@ ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] ; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,3,2,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,3,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] ; SSE2-NEXT: 
pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm7, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm5, %xmm8 ; SSE2-NEXT: pand %xmm4, %xmm2 ; SSE2-NEXT: pandn %xmm3, %xmm4 ; SSE2-NEXT: por %xmm2, %xmm4 @@ -952,12 +952,12 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] ; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE2-NEXT: pandn %xmm0, %xmm6 ; SSE2-NEXT: por %xmm2, %xmm6 ; SSE2-NEXT: movups %xmm1, (%rsi) -; SSE2-NEXT: movdqu %xmm0, (%rdx) +; SSE2-NEXT: movdqu %xmm8, (%rdx) ; SSE2-NEXT: movdqu %xmm6, (%rcx) ; SSE2-NEXT: retq ; @@ -1057,7 +1057,7 @@ define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-LABEL: interleave_24i16_out_reverse: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqu (%rdi), %xmm8 +; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: movdqu 16(%rdi), %xmm1 ; SSE2-NEXT: movdqu 32(%rdi), %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0] @@ -1069,7 +1069,7 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,5,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1] ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] @@ -1086,12 +1086,12 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] ; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,7,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm7, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm5, %xmm8 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm4 ; SSE2-NEXT: por %xmm1, %xmm4 @@ -1101,13 +1101,13 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] -; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] +; SSE2-NEXT: pandn %xmm0, %xmm6 ; SSE2-NEXT: por %xmm1, %xmm6 ; SSE2-NEXT: movups %xmm2, (%rsi) -; SSE2-NEXT: movdqu %xmm0, (%rdx) +; SSE2-NEXT: movdqu %xmm8, (%rdx) ; SSE2-NEXT: movdqu %xmm6, (%rcx) ; SSE2-NEXT: retq ; @@ -1393,75 +1393,75 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-LABEL: interleave_24i32_out: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqu 64(%rdi), %xmm9 -; SSE2-NEXT: movups 80(%rdi), %xmm8 +; SSE2-NEXT: movdqu 
64(%rdi), %xmm1 +; SSE2-NEXT: movups 80(%rdi), %xmm4 ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm10 +; SSE2-NEXT: movdqu 16(%rdi), %xmm2 ; SSE2-NEXT: movups 32(%rdi), %xmm5 ; SSE2-NEXT: movdqu 48(%rdi), %xmm3 ; SSE2-NEXT: movaps %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm10[2,0] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm5[2,0] -; SSE2-NEXT: movaps %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm9[2,0] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm8[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm9[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[2,0] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm5[2,0] +; SSE2-NEXT: movaps %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[2,0] +; SSE2-NEXT: movdqa %xmm3, %xmm10 +; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm4[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm1[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm10[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm2[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0] -; SSE2-NEXT: movups %xmm2, 16(%rsi) -; SSE2-NEXT: movups %xmm4, (%rsi) +; SSE2-NEXT: movups %xmm10, 16(%rsi) +; SSE2-NEXT: movups %xmm8, (%rsi) ; SSE2-NEXT: movups %xmm3, 16(%rdx) ; SSE2-NEXT: movups %xmm0, (%rdx) -; SSE2-NEXT: movups %xmm1, 16(%rcx) +; SSE2-NEXT: movups %xmm9, 16(%rcx) ; SSE2-NEXT: movups %xmm7, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_out: ; SSE42: # %bb.0: -; SSE42-NEXT: movups 80(%rdi), %xmm8 -; SSE42-NEXT: movdqu 64(%rdi), %xmm9 +; SSE42-NEXT: movups 80(%rdi), %xmm0 +; SSE42-NEXT: movdqu 64(%rdi), %xmm1 ; SSE42-NEXT: movdqu (%rdi), %xmm3 ; SSE42-NEXT: movdqu 16(%rdi), %xmm2 -; SSE42-NEXT: movups 32(%rdi), %xmm10 +; SSE42-NEXT: movups 32(%rdi), %xmm4 ; SSE42-NEXT: movdqu 48(%rdi), %xmm5 ; SSE42-NEXT: movdqa %xmm2, %xmm6 ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[2,3] -; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[1] -; SSE42-NEXT: movdqa %xmm9, %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm9[2,3] -; 
SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[1] -; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,2,2] +; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[1] +; SSE42-NEXT: movdqa %xmm1, %xmm8 +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] +; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[1] +; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,2,2,2] ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm4[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm4[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm10[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] +; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5],xmm10[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3],xmm0[4,5,6,7] -; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,3] +; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3],xmm9[4,5,6,7] +; SSE42-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,3] ; SSE42-NEXT: movups %xmm5, 16(%rsi) ; SSE42-NEXT: movups %xmm3, (%rsi) -; SSE42-NEXT: movdqu %xmm4, 16(%rdx) +; SSE42-NEXT: movdqu %xmm10, 16(%rdx) ; SSE42-NEXT: movdqu %xmm6, (%rdx) -; SSE42-NEXT: movups %xmm0, 16(%rcx) +; SSE42-NEXT: movups %xmm9, 16(%rcx) ; SSE42-NEXT: movups %xmm7, (%rcx) ; SSE42-NEXT: retq ; @@ -1633,35 +1633,35 @@ ; SSE2-NEXT: movups 16(%rsi), %xmm0 ; SSE2-NEXT: movups (%rdx), %xmm2 ; SSE2-NEXT: movups 16(%rdx), %xmm5 -; SSE2-NEXT: movups (%rcx), %xmm8 -; SSE2-NEXT: movups 16(%rcx), %xmm9 -; SSE2-NEXT: movaps %xmm8, %xmm7 +; SSE2-NEXT: movups (%rcx), %xmm4 +; SSE2-NEXT: movups 16(%rcx), %xmm6 +; SSE2-NEXT: movaps %xmm4, %xmm7 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[1,3] ; SSE2-NEXT: movaps %xmm1, %xmm3 ; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0,2] ; SSE2-NEXT: movaps %xmm0, %xmm7 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] -; SSE2-NEXT: movaps %xmm9, %xmm6 -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[1,3] -; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: movaps %xmm6, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3] +; SSE2-NEXT: movaps %xmm0, %xmm9 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm9[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm5[1,1] -; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm7[0,2] -; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm6[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm5[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[0,2] +; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[0,2] ; SSE2-NEXT: movaps %xmm1, %xmm5 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm8[2,3] -; 
SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[1,1] -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE2-NEXT: movups %xmm8, 16(%rdi) -; SSE2-NEXT: movups %xmm4, 48(%rdi) -; SSE2-NEXT: movups %xmm9, 64(%rdi) +; SSE2-NEXT: movups %xmm4, 16(%rdi) +; SSE2-NEXT: movups %xmm9, 48(%rdi) +; SSE2-NEXT: movups %xmm6, 64(%rdi) ; SSE2-NEXT: movups %xmm3, (%rdi) ; SSE2-NEXT: movups %xmm1, 32(%rdi) ; SSE2-NEXT: movups %xmm0, 80(%rdi) @@ -1671,38 +1671,38 @@ ; SSE42: # %bb.0: ; SSE42-NEXT: movdqu (%rsi), %xmm0 ; SSE42-NEXT: movdqu 16(%rsi), %xmm4 -; SSE42-NEXT: movdqu (%rdx), %xmm9 +; SSE42-NEXT: movdqu (%rdx), %xmm2 ; SSE42-NEXT: movdqu 16(%rdx), %xmm5 ; SSE42-NEXT: movdqu (%rcx), %xmm3 ; SSE42-NEXT: movdqu 16(%rcx), %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,2,2] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5],xmm9[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2] +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm0[4,5],xmm9[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm3[2,3],xmm9[4,5,6,7] ; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] -; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm9[3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3,4,5],xmm3[6,7] -; SSE42-NEXT: movdqu %xmm3, 32(%rdi) +; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] +; SSE42-NEXT: movdqu %xmm2, 32(%rdi) ; SSE42-NEXT: movdqu %xmm5, 80(%rdi) -; SSE42-NEXT: movdqu %xmm2, 16(%rdi) -; SSE42-NEXT: movdqu %xmm1, 48(%rdi) +; SSE42-NEXT: movdqu %xmm9, 16(%rdi) +; SSE42-NEXT: movdqu %xmm8, 48(%rdi) ; SSE42-NEXT: movdqu %xmm7, 64(%rdi) -; SSE42-NEXT: movdqu %xmm8, 
(%rdi) +; SSE42-NEXT: movdqu %xmm1, (%rdi) ; SSE42-NEXT: retq ; ; AVX1-LABEL: interleave_24i32_in: @@ -2009,19 +2009,19 @@ ; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] ; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; XOP-NEXT: vpalignr {{.*#+}} xmm8 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] ; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] ; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10] -; XOP-NEXT: vpperm %xmm3, %xmm4, %xmm2, %xmm2 -; XOP-NEXT: vpperm %xmm3, %xmm0, %xmm7, %xmm0 -; XOP-NEXT: vpperm %xmm3, %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpperm %xmm3, %xmm1, %xmm6, %xmm1 -; XOP-NEXT: vpperm %xmm3, %xmm5, %xmm8, %xmm7 -; XOP-NEXT: vpperm %xmm3, %xmm6, %xmm5, %xmm3 -; XOP-NEXT: vmovdqa %xmm3, 80(%rdi) -; XOP-NEXT: vmovdqa %xmm7, 64(%rdi) +; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10] +; XOP-NEXT: vpperm %xmm8, %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpperm %xmm8, %xmm0, %xmm7, %xmm0 +; XOP-NEXT: vpperm %xmm8, %xmm7, %xmm4, %xmm4 +; XOP-NEXT: vpperm %xmm8, %xmm1, %xmm6, %xmm1 +; XOP-NEXT: vpperm %xmm8, %xmm5, %xmm3, %xmm3 +; XOP-NEXT: vpperm %xmm8, %xmm6, %xmm5, %xmm5 +; XOP-NEXT: vmovdqa %xmm5, 80(%rdi) +; XOP-NEXT: vmovdqa %xmm3, 64(%rdi) ; XOP-NEXT: vmovdqa %xmm1, 48(%rdi) ; XOP-NEXT: vmovdqa %xmm4, 32(%rdi) ; XOP-NEXT: vmovdqa %xmm2, 16(%rdi) @@ -2181,19 +2181,19 @@ ; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] ; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; XOP-NEXT: vpalignr {{.*#+}} xmm8 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] ; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] ; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10] -; XOP-NEXT: vpperm %xmm3, %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpperm %xmm3, %xmm0, %xmm7, %xmm0 -; XOP-NEXT: vpperm %xmm3, %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpperm %xmm3, %xmm2, %xmm6, %xmm2 -; XOP-NEXT: vpperm %xmm3, %xmm5, %xmm8, %xmm7 -; XOP-NEXT: vpperm %xmm3, %xmm6, %xmm5, %xmm3 -; XOP-NEXT: vmovdqa %xmm3, 80(%rdi) -; XOP-NEXT: vmovdqa %xmm7, 64(%rdi) +; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10] +; XOP-NEXT: vpperm %xmm8, %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpperm %xmm8, %xmm0, %xmm7, %xmm0 +; XOP-NEXT: vpperm %xmm8, %xmm7, %xmm4, %xmm4 +; XOP-NEXT: vpperm %xmm8, %xmm2, %xmm6, %xmm2 +; XOP-NEXT: vpperm %xmm8, %xmm5, %xmm3, %xmm3 +; XOP-NEXT: vpperm %xmm8, %xmm6, %xmm5, %xmm5 +; XOP-NEXT: vmovdqa %xmm5, 80(%rdi) +; XOP-NEXT: vmovdqa %xmm3, 64(%rdi) ; XOP-NEXT: vmovdqa %xmm2, 48(%rdi) ; XOP-NEXT: vmovdqa %xmm4, 32(%rdi) ; XOP-NEXT: vmovdqa %xmm1, 16(%rdi) diff --git a/llvm/test/CodeGen/X86/or-address.ll 
b/llvm/test/CodeGen/X86/or-address.ll --- a/llvm/test/CodeGen/X86/or-address.ll +++ b/llvm/test/CodeGen/X86/or-address.ll @@ -47,10 +47,10 @@ } ; CHECK-LABEL: test1: -; CHECK: movl %{{.*}}, (%[[RDI:...]],%[[RCX:...]],4) -; CHECK: movl %{{.*}}, 8(%[[RDI]],%[[RCX]],4) -; CHECK: movl %{{.*}}, 4(%[[RDI]],%[[RCX]],4) -; CHECK: movl %{{.*}}, 12(%[[RDI]],%[[RCX]],4) +; CHECK: movl %{{.*}}, (%[[BASE:r.*]],%[[INDEX:r.*]],4) +; CHECK: movl %{{.*}}, 8(%[[BASE]],%[[INDEX]],4) +; CHECK: movl %{{.*}}, 4(%[[BASE]],%[[INDEX]],4) +; CHECK: movl %{{.*}}, 12(%[[BASE]],%[[INDEX]],4) define void @test1(ptr nocapture %array, i32 %r0, i8 signext %k, i8 signext %i0) nounwind { bb.nph: diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll --- a/llvm/test/CodeGen/X86/paddus.ll +++ b/llvm/test/CodeGen/X86/paddus.ll @@ -397,23 +397,23 @@ define <64 x i8> @test13(<64 x i8> %x) { ; SSE-LABEL: test13: ; SSE: # %bb.0: -; SSE-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psubb %xmm8, %xmm5 +; SSE-NEXT: psubb %xmm4, %xmm5 ; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: psubb %xmm8, %xmm6 +; SSE-NEXT: psubb %xmm4, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: psubb %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: psubb %xmm8, %xmm4 -; SSE-NEXT: pcmpeqb %xmm8, %xmm3 +; SSE-NEXT: psubb %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: psubb %xmm4, %xmm8 +; SSE-NEXT: pcmpeqb %xmm4, %xmm3 ; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pcmpeqb %xmm8, %xmm2 +; SSE-NEXT: pcmpeqb %xmm4, %xmm2 ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pcmpeqb %xmm8, %xmm1 +; SSE-NEXT: pcmpeqb %xmm4, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: pcmpeqb %xmm8, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: test13: @@ -1150,23 +1150,23 @@ define <32 x i16> @test31(<32 x i16> %x) { ; SSE-LABEL: test31: ; SSE: # %bb.0: -; SSE-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psubw %xmm8, %xmm5 +; SSE-NEXT: psubw %xmm4, %xmm5 ; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: psubw %xmm8, %xmm6 +; SSE-NEXT: psubw %xmm4, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: psubw %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: psubw %xmm8, %xmm4 -; SSE-NEXT: pcmpeqw %xmm8, %xmm3 +; SSE-NEXT: psubw %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: psubw %xmm4, %xmm8 +; SSE-NEXT: pcmpeqw %xmm4, %xmm3 ; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pcmpeqw %xmm8, %xmm2 +; SSE-NEXT: pcmpeqw %xmm4, %xmm2 ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pcmpeqw %xmm8, %xmm1 +; SSE-NEXT: pcmpeqw %xmm4, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: pcmpeqw %xmm8, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pcmpeqw %xmm4, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: test31: diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -870,17 +870,17 @@ ; SSE2-NEXT: pmullw %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: packuswb %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: 
movdqa %xmm1, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm9 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm5, %xmm1 ; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: packuswb %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -1237,67 +1237,67 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-LABEL: mul_v8i64_sext: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm14, %xmm10 -; SSE2-NEXT: movdqa %xmm14, %xmm8 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm11 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE2-NEXT: pcmpgtd %xmm15, %xmm13 -; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm15, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm4 -; SSE2-NEXT: paddq %xmm6, %xmm4 -; SSE2-NEXT: psllq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm15, %xmm0 -; SSE2-NEXT: paddq %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,3,3] +; 
SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm13 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm11[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm12 +; SSE2-NEXT: paddq %xmm14, %xmm12 +; SSE2-NEXT: psllq $32, %xmm12 +; SSE2-NEXT: pmuludq %xmm4, %xmm0 +; SSE2-NEXT: paddq %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm11, %xmm5 -; SSE2-NEXT: paddq %xmm4, %xmm5 -; SSE2-NEXT: psllq $32, %xmm5 -; SSE2-NEXT: pmuludq %xmm11, %xmm1 -; SSE2-NEXT: paddq %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm9, %xmm11 +; SSE2-NEXT: paddq %xmm4, %xmm11 +; SSE2-NEXT: psllq $32, %xmm11 +; SSE2-NEXT: pmuludq %xmm9, %xmm1 +; SSE2-NEXT: paddq %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,1,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm14, %xmm5 -; SSE2-NEXT: paddq %xmm4, %xmm5 -; SSE2-NEXT: psllq $32, %xmm5 -; SSE2-NEXT: pmuludq %xmm14, %xmm2 -; SSE2-NEXT: paddq %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm6, %xmm9 +; SSE2-NEXT: paddq %xmm4, %xmm9 +; SSE2-NEXT: psllq $32, %xmm9 +; SSE2-NEXT: pmuludq %xmm6, %xmm2 +; SSE2-NEXT: paddq %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm5 -; SSE2-NEXT: paddq %xmm4, %xmm5 -; SSE2-NEXT: psllq $32, %xmm5 -; SSE2-NEXT: pmuludq %xmm8, %xmm3 -; SSE2-NEXT: paddq %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm6 +; SSE2-NEXT: paddq %xmm4, %xmm6 +; SSE2-NEXT: psllq $32, %xmm6 +; SSE2-NEXT: pmuludq %xmm5, %xmm3 +; SSE2-NEXT: paddq %xmm6, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v8i64_sext: diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -322,44 +322,42 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: and_mulhuw_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm6, %xmm8 -; SSE2-NEXT: packssdw %xmm7, %xmm8 -; SSE2-NEXT: pmulhw %xmm2, %xmm8 -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: packssdw %xmm5, %xmm6 -; SSE2-NEXT: pmulhw %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: packssdw %xmm7, %xmm6 +; SSE2-NEXT: pmulhw %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm5 +; 
SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: packssdw %xmm5, %xmm8 +; SSE2-NEXT: pmulhw %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: and_mulhuw_v16i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767] -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] +; SSE41-NEXT: pand %xmm8, %xmm3 +; SSE41-NEXT: pand %xmm8, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm6, %xmm1 -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm6, %xmm7 -; SSE41-NEXT: pand %xmm6, %xmm8 -; SSE41-NEXT: packusdw %xmm7, %xmm8 -; SSE41-NEXT: pmulhw %xmm2, %xmm8 -; SSE41-NEXT: pand %xmm6, %xmm5 -; SSE41-NEXT: pand %xmm4, %xmm6 -; SSE41-NEXT: packusdw %xmm5, %xmm6 -; SSE41-NEXT: pmulhw %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm1 +; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: packusdw %xmm7, %xmm6 +; SSE41-NEXT: pmulhw %xmm2, %xmm6 +; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pand %xmm4, %xmm8 +; SSE41-NEXT: packusdw %xmm5, %xmm8 +; SSE41-NEXT: pmulhw %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: and_mulhuw_v16i16: @@ -1088,18 +1086,18 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhuw %xmm4, %xmm0 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pxor %xmm8, %xmm8 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE41-NEXT: pmulhuw %xmm5, %xmm1 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE41-NEXT: pmulhuw %xmm6, %xmm2 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] ; SSE41-NEXT: pmulhuw %xmm7, %xmm3 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] ; SSE41-NEXT: movdqa %xmm3, 112(%rdi) ; SSE41-NEXT: movdqa %xmm7, 96(%rdi) ; SSE41-NEXT: movdqa %xmm2, 80(%rdi) @@ -1107,7 +1105,7 @@ ; SSE41-NEXT: movdqa %xmm1, 48(%rdi) ; SSE41-NEXT: movdqa %xmm5, 32(%rdi) ; SSE41-NEXT: movdqa %xmm0, 16(%rdi) -; SSE41-NEXT: movdqa %xmm8, (%rdi) +; SSE41-NEXT: movdqa %xmm4, (%rdi) ; SSE41-NEXT: retq ; ; AVX2-LABEL: zext_mulhuw_v32i16_lshr: @@ -1183,18 +1181,18 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhw %xmm4, %xmm0 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pxor 
%xmm4, %xmm4 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pxor %xmm8, %xmm8 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE41-NEXT: pmulhw %xmm5, %xmm1 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE41-NEXT: pmulhw %xmm6, %xmm2 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] ; SSE41-NEXT: pmulhw %xmm7, %xmm3 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] ; SSE41-NEXT: movdqa %xmm3, 112(%rdi) ; SSE41-NEXT: movdqa %xmm7, 96(%rdi) ; SSE41-NEXT: movdqa %xmm2, 80(%rdi) @@ -1202,7 +1200,7 @@ ; SSE41-NEXT: movdqa %xmm1, 48(%rdi) ; SSE41-NEXT: movdqa %xmm5, 32(%rdi) ; SSE41-NEXT: movdqa %xmm0, 16(%rdi) -; SSE41-NEXT: movdqa %xmm8, (%rdi) +; SSE41-NEXT: movdqa %xmm4, (%rdi) ; SSE41-NEXT: retq ; ; AVX2-LABEL: mulhsw_v32i16_lshr: @@ -1713,58 +1711,58 @@ ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE2-NEXT: psrad $16, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; SSE2-NEXT: psrad $16, %xmm11 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; SSE2-NEXT: psrad $16, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] -; SSE2-NEXT: psrad $16, %xmm13 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; SSE2-NEXT: psrad $16, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = 
xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] -; SSE2-NEXT: psrad $16, %xmm15 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; SSE2-NEXT: psrad $16, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm3 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; SSE2-NEXT: psrad $16, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; SSE2-NEXT: psrad $16, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; SSE2-NEXT: psrad $16, %xmm14 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; SSE2-NEXT: psrad $16, %xmm15 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $16, %xmm7 ; SSE2-NEXT: movdqa %xmm7, 240(%rdi) -; SSE2-NEXT: movdqa %xmm3, 224(%rdi) +; SSE2-NEXT: movdqa %xmm15, 224(%rdi) ; SSE2-NEXT: movdqa %xmm6, 208(%rdi) -; SSE2-NEXT: movdqa %xmm1, 192(%rdi) +; SSE2-NEXT: movdqa %xmm14, 192(%rdi) ; SSE2-NEXT: movdqa %xmm5, 176(%rdi) -; SSE2-NEXT: movdqa %xmm2, 160(%rdi) +; SSE2-NEXT: movdqa %xmm13, 160(%rdi) ; SSE2-NEXT: movdqa %xmm4, 144(%rdi) -; SSE2-NEXT: movdqa %xmm0, 128(%rdi) -; SSE2-NEXT: movdqa %xmm15, 112(%rdi) -; SSE2-NEXT: movdqa %xmm14, 96(%rdi) -; SSE2-NEXT: movdqa %xmm13, 80(%rdi) -; SSE2-NEXT: movdqa %xmm12, 64(%rdi) -; SSE2-NEXT: movdqa %xmm11, 48(%rdi) -; SSE2-NEXT: movdqa %xmm10, 32(%rdi) -; SSE2-NEXT: movdqa %xmm9, 16(%rdi) +; SSE2-NEXT: movdqa %xmm12, 128(%rdi) +; SSE2-NEXT: movdqa %xmm3, 112(%rdi) +; SSE2-NEXT: movdqa %xmm11, 96(%rdi) +; SSE2-NEXT: movdqa %xmm2, 80(%rdi) +; SSE2-NEXT: movdqa %xmm10, 64(%rdi) +; SSE2-NEXT: movdqa %xmm1, 48(%rdi) +; SSE2-NEXT: movdqa %xmm9, 32(%rdi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdi) ; SSE2-NEXT: movdqa %xmm8, (%rdi) ; SSE2-NEXT: retq ; @@ -1774,50 +1772,50 @@ ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE41-NEXT: pmovsxwd %xmm0, %xmm9 +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: pmovsxwd %xmm1, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE41-NEXT: pmovsxwd %xmm0, %xmm11 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 ; 
SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 -; SSE41-NEXT: pmovsxwd %xmm2, %xmm12 +; SSE41-NEXT: pmovsxwd %xmm2, %xmm10 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE41-NEXT: pmovsxwd %xmm2, %xmm13 +; SSE41-NEXT: pmovsxwd %xmm2, %xmm2 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 -; SSE41-NEXT: pmovsxwd %xmm3, %xmm14 +; SSE41-NEXT: pmovsxwd %xmm3, %xmm11 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE41-NEXT: pmovsxwd %xmm3, %xmm15 +; SSE41-NEXT: pmovsxwd %xmm3, %xmm3 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 -; SSE41-NEXT: pmovsxwd %xmm4, %xmm0 +; SSE41-NEXT: pmovsxwd %xmm4, %xmm12 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm4, %xmm4 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 -; SSE41-NEXT: pmovsxwd %xmm5, %xmm2 +; SSE41-NEXT: pmovsxwd %xmm5, %xmm13 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm5, %xmm5 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 -; SSE41-NEXT: pmovsxwd %xmm6, %xmm1 +; SSE41-NEXT: pmovsxwd %xmm6, %xmm14 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm6, %xmm6 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 -; SSE41-NEXT: pmovsxwd %xmm7, %xmm3 +; SSE41-NEXT: pmovsxwd %xmm7, %xmm15 ; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm7, %xmm7 ; SSE41-NEXT: movdqa %xmm7, 240(%rdi) -; SSE41-NEXT: movdqa %xmm3, 224(%rdi) +; SSE41-NEXT: movdqa %xmm15, 224(%rdi) ; SSE41-NEXT: movdqa %xmm6, 208(%rdi) -; SSE41-NEXT: movdqa %xmm1, 192(%rdi) +; SSE41-NEXT: movdqa %xmm14, 192(%rdi) ; SSE41-NEXT: movdqa %xmm5, 176(%rdi) -; SSE41-NEXT: movdqa %xmm2, 160(%rdi) +; SSE41-NEXT: movdqa %xmm13, 160(%rdi) ; SSE41-NEXT: movdqa %xmm4, 144(%rdi) -; SSE41-NEXT: movdqa %xmm0, 128(%rdi) -; SSE41-NEXT: movdqa %xmm15, 112(%rdi) -; SSE41-NEXT: movdqa %xmm14, 96(%rdi) -; SSE41-NEXT: movdqa %xmm13, 80(%rdi) -; SSE41-NEXT: movdqa %xmm12, 64(%rdi) -; SSE41-NEXT: movdqa %xmm11, 48(%rdi) -; SSE41-NEXT: movdqa %xmm10, 32(%rdi) -; SSE41-NEXT: movdqa %xmm9, 16(%rdi) +; SSE41-NEXT: movdqa %xmm12, 128(%rdi) +; SSE41-NEXT: movdqa %xmm3, 112(%rdi) +; SSE41-NEXT: movdqa %xmm11, 96(%rdi) +; SSE41-NEXT: movdqa %xmm2, 80(%rdi) +; SSE41-NEXT: movdqa %xmm10, 64(%rdi) +; SSE41-NEXT: movdqa %xmm1, 48(%rdi) +; SSE41-NEXT: movdqa %xmm9, 32(%rdi) +; SSE41-NEXT: movdqa %xmm0, 16(%rdi) ; SSE41-NEXT: movdqa %xmm8, (%rdi) ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -394,10 +394,10 @@ ; X64-NEXT: movq %rax, %rdx ; X64-NEXT: shrq $4, %rdx ; X64-NEXT: addq %rax, %rdx -; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %r9, %rdx -; X64-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101 -; X64-NEXT: imulq %rsi, %rdx +; X64-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %rsi, %rdx +; X64-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101 +; X64-NEXT: imulq %r9, %rdx ; X64-NEXT: shrq $56, %rdx ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: shrq %rax @@ -411,8 +411,8 @@ ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: shrq $4, %rax ; X64-NEXT: addq %rcx, %rax -; X64-NEXT: andq %r9, %rax -; X64-NEXT: imulq %rsi, %rax +; X64-NEXT: andq %rsi, %rax +; X64-NEXT: imulq %r9, %rax ; X64-NEXT: shrq $56, %rax ; X64-NEXT: addq %rdx, %rax ; X64-NEXT: xorl %edx, %edx @@ -899,10 +899,10 @@ ; X64-NEXT: movq %rax, %rdx ; X64-NEXT: shrq $4, %rdx ; X64-NEXT: addq %rax, %rdx -; X64-NEXT: movabsq 
$1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %r9, %rdx -; X64-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101 -; X64-NEXT: imulq %rsi, %rdx +; X64-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %rsi, %rdx +; X64-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101 +; X64-NEXT: imulq %r9, %rdx ; X64-NEXT: shrq $56, %rdx ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: shrq %rax @@ -916,8 +916,8 @@ ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: shrq $4, %rax ; X64-NEXT: addq %rcx, %rax -; X64-NEXT: andq %r9, %rax -; X64-NEXT: imulq %rsi, %rax +; X64-NEXT: andq %rsi, %rax +; X64-NEXT: imulq %r9, %rax ; X64-NEXT: shrq $56, %rax ; X64-NEXT: addq %rdx, %rax ; X64-NEXT: xorl %edx, %edx @@ -1329,10 +1329,10 @@ ; X64-NEXT: movq %rax, %rdx ; X64-NEXT: shrq $4, %rdx ; X64-NEXT: addq %rax, %rdx -; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %r9, %rdx -; X64-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101 -; X64-NEXT: imulq %rsi, %rdx +; X64-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %rsi, %rdx +; X64-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101 +; X64-NEXT: imulq %r9, %rdx ; X64-NEXT: shrq $56, %rdx ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: shrq %rax @@ -1346,8 +1346,8 @@ ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: shrq $4, %rax ; X64-NEXT: addq %rcx, %rax -; X64-NEXT: andq %r9, %rax -; X64-NEXT: imulq %rsi, %rax +; X64-NEXT: andq %rsi, %rax +; X64-NEXT: imulq %r9, %rax ; X64-NEXT: shrq $56, %rax ; X64-NEXT: addq %rdx, %rax ; X64-NEXT: xorl %edx, %edx diff --git a/llvm/test/CodeGen/X86/pr18344.ll b/llvm/test/CodeGen/X86/pr18344.ll --- a/llvm/test/CodeGen/X86/pr18344.ll +++ b/llvm/test/CodeGen/X86/pr18344.ll @@ -37,13 +37,13 @@ ; X64-NEXT: movdqu (%rdx), %xmm0 ; X64-NEXT: pslld $4, %xmm0 ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: movslq %eax, %r8 +; X64-NEXT: cltq ; X64-NEXT: pextrd $1, %xmm0, %ecx ; X64-NEXT: movslq %ecx, %rcx ; X64-NEXT: pextrd $2, %xmm0, %edx ; X64-NEXT: movslq %edx, %rdx -; X64-NEXT: pextrd $3, %xmm0, %eax -; X64-NEXT: cltq +; X64-NEXT: pextrd $3, %xmm0, %r8d +; X64-NEXT: movslq %r8d, %r8 ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero diff --git a/llvm/test/CodeGen/X86/pr21792.ll b/llvm/test/CodeGen/X86/pr21792.ll --- a/llvm/test/CodeGen/X86/pr21792.ll +++ b/llvm/test/CodeGen/X86/pr21792.ll @@ -12,16 +12,16 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movd %xmm0, %r8d -; CHECK-NEXT: leaq stuff(%r8), %rdi -; CHECK-NEXT: pextrd $1, %xmm0, %eax -; CHECK-NEXT: leaq stuff(%rax), %rsi -; CHECK-NEXT: pextrd $2, %xmm0, %edx -; CHECK-NEXT: pextrd $3, %xmm0, %ecx -; CHECK-NEXT: leaq stuff(%rdx), %rdx -; CHECK-NEXT: leaq stuff(%rcx), %rcx -; CHECK-NEXT: leaq stuff+8(%r8), %r8 -; CHECK-NEXT: leaq stuff+8(%rax), %r9 +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: leaq stuff(%rax), %rdi +; CHECK-NEXT: pextrd $1, %xmm0, %r9d +; CHECK-NEXT: leaq stuff(%r9), %rsi +; CHECK-NEXT: pextrd $2, %xmm0, %ecx +; CHECK-NEXT: pextrd $3, %xmm0, %r8d +; CHECK-NEXT: leaq stuff(%rcx), %rdx +; CHECK-NEXT: leaq stuff(%r8), %rcx +; CHECK-NEXT: leaq stuff+8(%rax), %r8 +; CHECK-NEXT: leaq stuff+8(%r9), %r9 ; CHECK-NEXT: callq toto@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 diff --git 
a/llvm/test/CodeGen/X86/pr23603.ll b/llvm/test/CodeGen/X86/pr23603.ll --- a/llvm/test/CodeGen/X86/pr23603.ll +++ b/llvm/test/CodeGen/X86/pr23603.ll @@ -9,14 +9,14 @@ ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movq %rdx, %r14 +; CHECK-NEXT: movq %rdx, %rbx ; CHECK-NEXT: movl %esi, %ebp -; CHECK-NEXT: movl (%rdi), %ebx +; CHECK-NEXT: movl (%rdi), %r14d ; CHECK-NEXT: callq free_v@PLT ; CHECK-NEXT: testl %ebp, %ebp ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %left -; CHECK-NEXT: movl %ebx, (%r14) +; CHECK-NEXT: movl %r14d, (%rbx) ; CHECK-NEXT: .LBB0_2: # %merge ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll --- a/llvm/test/CodeGen/X86/pr29112.ll +++ b/llvm/test/CodeGen/X86/pr29112.ll @@ -10,11 +10,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: vmovaps %xmm1, %xmm9 -; CHECK-NEXT: vmovaps {{.*#+}} xmm14 = [4,22,1,17] -; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm14 -; CHECK-NEXT: vmovaps {{.*#+}} xmm10 = [4,30,1,22] -; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10 +; CHECK-NEXT: vmovaps %xmm1, %xmm13 +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4,22,1,17] +; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: vmovaps {{.*#+}} xmm12 = [4,30,1,22] +; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm12 ; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,29] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8 ; CHECK-NEXT: vmovaps {{.*#+}} xmm7 = <5,20,u,u> @@ -22,37 +22,36 @@ ; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,7] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4 ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm5 -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm6 -; CHECK-NEXT: vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm1[0,1,2],xmm3[1] +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm9 +; CHECK-NEXT: vunpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm9[0,1],xmm2[1],xmm9[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1] ; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1] ; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4 -; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,3,3,3] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3] +; CHECK-NEXT: vpermilps {{.*#+}} xmm11 = xmm2[3,3,3,3] +; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] ; CHECK-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3] -; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3] -; CHECK-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1] -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm8 -; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm7[0,1],xmm2[1],xmm7[3] +; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3] +; CHECK-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm3[1] +; CHECK-NEXT: vinsertps 
{{.*#+}} xmm11 = xmm11[0,1,2],xmm3[1] +; CHECK-NEXT: vaddps %xmm8, %xmm11, %xmm8 +; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,1],xmm2[3,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2] -; CHECK-NEXT: vaddps %xmm2, %xmm14, %xmm2 -; CHECK-NEXT: vmovaps %xmm13, %xmm1 -; CHECK-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vaddps %xmm10, %xmm13, %xmm10 -; CHECK-NEXT: vaddps %xmm13, %xmm13, %xmm3 -; CHECK-NEXT: vaddps %xmm12, %xmm14, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm2 +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vaddps %xmm1, %xmm12, %xmm9 +; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm3 +; CHECK-NEXT: vaddps %xmm0, %xmm10, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm8, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm13, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %xmm10, (%rsp) -; CHECK-NEXT: vmovaps %xmm9, %xmm3 +; CHECK-NEXT: vmovaps %xmm9, (%rsp) +; CHECK-NEXT: vmovaps %xmm13, %xmm3 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo@PLT ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/pr32329.ll b/llvm/test/CodeGen/X86/pr32329.ll --- a/llvm/test/CodeGen/X86/pr32329.ll +++ b/llvm/test/CodeGen/X86/pr32329.ll @@ -68,26 +68,26 @@ ; ; X64-LABEL: foo: ; X64: # %bb.0: # %entry -; X64-NEXT: movsbl var_27(%rip), %r9d -; X64-NEXT: movzwl var_2(%rip), %r8d +; X64-NEXT: movsbl var_27(%rip), %eax +; X64-NEXT: movzwl var_2(%rip), %edx ; X64-NEXT: movl var_310(%rip), %ecx -; X64-NEXT: imull %r9d, %ecx +; X64-NEXT: imull %eax, %ecx ; X64-NEXT: addl var_24(%rip), %ecx ; X64-NEXT: movl $4194303, %esi # imm = 0x3FFFFF ; X64-NEXT: andl obj(%rip), %esi ; X64-NEXT: leal (%rsi,%rsi), %edi -; X64-NEXT: subl %r9d, %edi -; X64-NEXT: movl %edi, %edx -; X64-NEXT: subl %r8d, %edx -; X64-NEXT: imull %edx, %ecx +; X64-NEXT: subl %eax, %edi +; X64-NEXT: movl %edi, %r8d +; X64-NEXT: subl %edx, %r8d +; X64-NEXT: imull %r8d, %ecx ; X64-NEXT: addb $113, %cl -; X64-NEXT: movl $9, %eax +; X64-NEXT: movl $9, %edx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shlq %cl, %rax -; X64-NEXT: movq %rax, var_50(%rip) -; X64-NEXT: cmpl %esi, %edx +; X64-NEXT: shlq %cl, %rdx +; X64-NEXT: movq %rdx, var_50(%rip) +; X64-NEXT: cmpl %esi, %r8d ; X64-NEXT: setge var_205(%rip) -; X64-NEXT: imull %r9d, %edi +; X64-NEXT: imull %eax, %edi ; X64-NEXT: movb %dil, var_218(%rip) ; X64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr35316.ll b/llvm/test/CodeGen/X86/pr35316.ll --- a/llvm/test/CodeGen/X86/pr35316.ll +++ b/llvm/test/CodeGen/X86/pr35316.ll @@ -26,19 +26,19 @@ ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movl $0, b(%rip) -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl a(%rip) ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: movl c(%rip), %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl %esi +; CHECK-NEXT: idivl %r8d ; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: andl %r8d, %eax +; CHECK-NEXT: andl %esi, %eax ; CHECK-NEXT: movl %eax, (%rax) ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr38185.ll b/llvm/test/CodeGen/X86/pr38185.ll --- a/llvm/test/CodeGen/X86/pr38185.ll +++ 
b/llvm/test/CodeGen/X86/pr38185.ll @@ -8,19 +8,19 @@ ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %r9 -; CHECK-NEXT: cmpq %rcx, %r9 +; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: cmpq %rcx, %rax ; CHECK-NEXT: je .LBB0_3 ; CHECK-NEXT: # %bb.2: # %body ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: movl $1, (%rdx,%r9,4) -; CHECK-NEXT: movzbl (%rdi,%r9,4), %r8d -; CHECK-NEXT: movzbl (%rsi,%r9,4), %eax -; CHECK-NEXT: andl %r8d, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: movl %eax, (%rdi,%r9,4) -; CHECK-NEXT: incq %r9 -; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, (%rdx,%rax,4) +; CHECK-NEXT: movzbl (%rdi,%rax,4), %r8d +; CHECK-NEXT: movzbl (%rsi,%rax,4), %r9d +; CHECK-NEXT: andl %r8d, %r9d +; CHECK-NEXT: andl $1, %r9d +; CHECK-NEXT: movl %r9d, (%rdi,%rax,4) +; CHECK-NEXT: incq %rax +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: jmp .LBB0_1 ; CHECK-NEXT: .LBB0_3: # %endloop ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr38217.ll b/llvm/test/CodeGen/X86/pr38217.ll --- a/llvm/test/CodeGen/X86/pr38217.ll +++ b/llvm/test/CodeGen/X86/pr38217.ll @@ -9,31 +9,30 @@ ; CHECK-NEXT: cmpq $10000, %rdi # imm = 0x2710 ; CHECK-NEXT: jb .LBB0_3 ; CHECK-NEXT: # %bb.1: # %.preheader -; CHECK-NEXT: movq %rdi, %r9 -; CHECK-NEXT: xorl %r10d, %r10d +; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movabsq $3777893186295716171, %r8 # imm = 0x346DC5D63886594B ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %r9, %rax +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: mulq %r8 ; CHECK-NEXT: shrq $11, %rdx ; CHECK-NEXT: imulq $10000, %rdx, %rax # imm = 0x2710 -; CHECK-NEXT: movq %r9, %rdi -; CHECK-NEXT: subq %rax, %rdi -; CHECK-NEXT: imulq $1374389535, %rdi, %rax # imm = 0x51EB851F +; CHECK-NEXT: movq %rdi, %r9 +; CHECK-NEXT: subq %rax, %r9 +; CHECK-NEXT: imulq $1374389535, %r9, %rax # imm = 0x51EB851F ; CHECK-NEXT: shrq $37, %rax -; CHECK-NEXT: imull $100, %eax, %ecx -; CHECK-NEXT: subl %ecx, %edi -; CHECK-NEXT: movl %r10d, %r11d -; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: subq %r11, %rcx -; CHECK-NEXT: movzwl _ZL11DIGIT_TABLE(%rdi,%rdi), %edi -; CHECK-NEXT: movw %di, -1(%rcx) +; CHECK-NEXT: imull $100, %eax, %r10d +; CHECK-NEXT: subl %r10d, %r9d +; CHECK-NEXT: movl %ecx, %r10d +; CHECK-NEXT: movq %rsi, %r11 +; CHECK-NEXT: subq %r10, %r11 +; CHECK-NEXT: movzwl _ZL11DIGIT_TABLE(%r9,%r9), %r9d +; CHECK-NEXT: movw %r9w, -1(%r11) ; CHECK-NEXT: movzwl _ZL11DIGIT_TABLE(%rax,%rax), %eax -; CHECK-NEXT: movw %ax, -3(%rcx) -; CHECK-NEXT: addl $4, %r10d -; CHECK-NEXT: cmpq $99999999, %r9 # imm = 0x5F5E0FF -; CHECK-NEXT: movq %rdx, %r9 +; CHECK-NEXT: movw %ax, -3(%r11) +; CHECK-NEXT: addl $4, %ecx +; CHECK-NEXT: cmpq $99999999, %rdi # imm = 0x5F5E0FF +; CHECK-NEXT: movq %rdx, %rdi ; CHECK-NEXT: ja .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll --- a/llvm/test/CodeGen/X86/pr43820.ll +++ b/llvm/test/CodeGen/X86/pr43820.ll @@ -12,311 +12,310 @@ ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; CHECK-NEXT: movq 
{{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r11 -; CHECK-NEXT: shrq $4, %r11 -; CHECK-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F -; CHECK-NEXT: andq %rsi, %r11 -; CHECK-NEXT: andq %rsi, %rbp -; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: orq %r11, %rbp -; CHECK-NEXT: movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333 -; CHECK-NEXT: movq %rbp, %r12 -; CHECK-NEXT: andq %rdi, %r12 -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: leaq (%rbp,%r12,4), %rbp -; CHECK-NEXT: movabsq $6148914691230924800, %r12 # imm = 0x5555555555000000 -; CHECK-NEXT: movq %rbp, %r13 -; CHECK-NEXT: andq %r12, %r13 -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: andq %r12, %rbp -; CHECK-NEXT: leaq (%rbp,%r13,2), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: bswapq %rbx -; CHECK-NEXT: movq %rbx, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: andq %rsi, %rbp -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: shlq $4, %rbx -; CHECK-NEXT: orq %rbp, %rbx -; CHECK-NEXT: movq %rbx, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: shrq $2, %rbx -; CHECK-NEXT: andq %rdi, %rbx -; CHECK-NEXT: leaq (%rbx,%rbp,4), %rbp -; CHECK-NEXT: movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555 -; CHECK-NEXT: movq %rbp, %r12 -; CHECK-NEXT: andq %rbx, %r12 -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: andq %rbx, %rbp -; CHECK-NEXT: leaq (%rbp,%r12,2), %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; CHECK-NEXT: bswapq %r12 +; CHECK-NEXT: movq %r12, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F +; CHECK-NEXT: andq %rax, %r10 +; CHECK-NEXT: andq %rax, %r12 +; CHECK-NEXT: shlq $4, %r12 +; CHECK-NEXT: orq %r10, %r12 +; CHECK-NEXT: movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333 +; CHECK-NEXT: movq %r12, %r13 +; CHECK-NEXT: andq %r10, %r13 +; CHECK-NEXT: shrq $2, %r12 +; CHECK-NEXT: andq %r10, %r12 +; CHECK-NEXT: leaq (%r12,%r13,4), %r12 +; CHECK-NEXT: movabsq $6148914691230924800, %r13 # imm = 0x5555555555000000 +; CHECK-NEXT: movq %r12, %rbp +; CHECK-NEXT: andq %r13, %rbp +; CHECK-NEXT: shrq %r12 +; CHECK-NEXT: andq %r13, %r12 +; CHECK-NEXT: leaq (%r12,%rbp,2), %rsi +; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: bswapq %r14 +; CHECK-NEXT: movq %r14, %r12 +; CHECK-NEXT: shrq $4, %r12 +; CHECK-NEXT: movq %rax, %rbp +; CHECK-NEXT: andq %rax, %r12 +; CHECK-NEXT: andq %rax, %r14 +; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: orq %r12, %r14 +; CHECK-NEXT: movq %r14, %r12 +; CHECK-NEXT: andq %r10, %r12 +; CHECK-NEXT: shrq $2, %r14 +; CHECK-NEXT: andq %r10, %r14 +; CHECK-NEXT: leaq (%r14,%r12,4), %r12 +; CHECK-NEXT: movabsq $6148914691236517205, %r14 # imm = 0x5555555555555555 +; CHECK-NEXT: movq %r12, %r13 +; CHECK-NEXT: andq %r14, %r13 +; CHECK-NEXT: shrq %r12 +; CHECK-NEXT: andq %r14, %r12 +; CHECK-NEXT: leaq (%r12,%r13,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %r15 -; CHECK-NEXT: movq %r15, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: andq %rsi, %rbp -; CHECK-NEXT: andq %rsi, %r15 +; CHECK-NEXT: movq %r15, %r12 +; CHECK-NEXT: shrq $4, %r12 +; CHECK-NEXT: andq %rbp, %r12 +; CHECK-NEXT: andq %rbp, %r15 ; CHECK-NEXT: shlq $4, %r15 -; CHECK-NEXT: orq %rbp, %r15 -; CHECK-NEXT: movq %r15, %rbp -; CHECK-NEXT: andq %rdi, %rbp +; CHECK-NEXT: orq %r12, %r15 +; CHECK-NEXT: movq %r15, 
%r12 +; CHECK-NEXT: andq %r10, %r12 ; CHECK-NEXT: shrq $2, %r15 -; CHECK-NEXT: andq %rdi, %r15 -; CHECK-NEXT: leaq (%r15,%rbp,4), %rbp -; CHECK-NEXT: movq %rbp, %r15 -; CHECK-NEXT: andq %rbx, %r15 -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: andq %rbx, %rbp -; CHECK-NEXT: leaq (%rbp,%r15,2), %rax +; CHECK-NEXT: andq %r10, %r15 +; CHECK-NEXT: leaq (%r15,%r12,4), %r15 +; CHECK-NEXT: movq %r15, %r12 +; CHECK-NEXT: andq %r14, %r12 +; CHECK-NEXT: shrq %r15 +; CHECK-NEXT: andq %r14, %r15 +; CHECK-NEXT: leaq (%r15,%r12,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: bswapq %r14 -; CHECK-NEXT: movq %r14, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: andq %rsi, %rbp -; CHECK-NEXT: andq %rsi, %r14 -; CHECK-NEXT: shlq $4, %r14 -; CHECK-NEXT: orq %rbp, %r14 -; CHECK-NEXT: movq %r14, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: shrq $2, %r14 -; CHECK-NEXT: andq %rdi, %r14 -; CHECK-NEXT: leaq (%r14,%rbp,4), %rbp -; CHECK-NEXT: movq %rbp, %r14 -; CHECK-NEXT: andq %rbx, %r14 -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: andq %rbx, %rbp -; CHECK-NEXT: leaq (%rbp,%r14,2), %rax +; CHECK-NEXT: bswapq %rbx +; CHECK-NEXT: movq %rbx, %r15 +; CHECK-NEXT: shrq $4, %r15 +; CHECK-NEXT: andq %rbp, %r15 +; CHECK-NEXT: andq %rbp, %rbx +; CHECK-NEXT: shlq $4, %rbx +; CHECK-NEXT: orq %r15, %rbx +; CHECK-NEXT: movq %rbx, %r15 +; CHECK-NEXT: andq %r10, %r15 +; CHECK-NEXT: shrq $2, %rbx +; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: leaq (%rbx,%r15,4), %rbx +; CHECK-NEXT: movq %rbx, %r15 +; CHECK-NEXT: andq %r14, %r15 +; CHECK-NEXT: shrq %rbx +; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: leaq (%rbx,%r15,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: bswapq %r10 -; CHECK-NEXT: movq %r10, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: andq %rsi, %rbp -; CHECK-NEXT: andq %rsi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: orq %rbp, %r10 -; CHECK-NEXT: movq %r10, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: shrq $2, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: leaq (%r10,%rbp,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: andq %rbx, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rax +; CHECK-NEXT: bswapq %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: shrq $4, %rbx +; CHECK-NEXT: andq %rbp, %rbx +; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: shlq $4, %rdi +; CHECK-NEXT: orq %rbx, %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: shrq $2, %rdi +; CHECK-NEXT: andq %r10, %rdi +; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: shrq %rdi +; CHECK-NEXT: andq %r14, %rdi +; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: shrq $4, %r10 -; CHECK-NEXT: andq %rsi, %r10 -; CHECK-NEXT: andq %rsi, %rbp -; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: andq %rbx, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: bswapq %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: shrq $4, %rbx +; CHECK-NEXT: andq 
%rbp, %rbx +; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: shlq $4, %rdi +; CHECK-NEXT: orq %rbx, %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: shrq $2, %rdi +; CHECK-NEXT: andq %r10, %rdi +; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: shrq %rdi +; CHECK-NEXT: andq %r14, %rdi +; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK-NEXT: bswapq %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: shrq $4, %r14 -; CHECK-NEXT: andq %rsi, %r14 -; CHECK-NEXT: andq %rsi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: orq %r14, %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: andq %rdi, %r14 -; CHECK-NEXT: shrq $2, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: movq %rdi, %rbp -; CHECK-NEXT: leaq (%r10,%r14,4), %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: andq %rbx, %r14 -; CHECK-NEXT: shrq %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: bswapq %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: shrq $4, %rbx +; CHECK-NEXT: andq %rbp, %rbx +; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: shlq $4, %rdi +; CHECK-NEXT: orq %rbx, %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: shrq $2, %rdi +; CHECK-NEXT: andq %r10, %rdi +; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: shrq %rdi +; CHECK-NEXT: andq %r14, %rdi +; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK-NEXT: bswapq %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: shrq $4, %r14 -; CHECK-NEXT: andq %rsi, %r14 -; CHECK-NEXT: andq %rsi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: orq %r14, %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: andq %rdi, %r14 -; CHECK-NEXT: shrq $2, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: leaq (%r10,%r14,4), %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: andq %rbx, %r14 -; CHECK-NEXT: shrq %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: bswapq %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: shrq $4, %rbx +; CHECK-NEXT: andq %rbp, %rbx +; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: shlq $4, %rdi +; CHECK-NEXT: orq %rbx, %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: shrq $2, %rdi +; CHECK-NEXT: andq %r10, %rdi +; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: shrq %rdi +; CHECK-NEXT: andq %r14, %rdi +; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK-NEXT: bswapq %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: shrq $4, %r14 -; CHECK-NEXT: andq %rsi, %r14 -; CHECK-NEXT: andq %rsi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: orq %r14, %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: andq %rdi, %r14 -; CHECK-NEXT: shrq $2, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: leaq (%r10,%r14,4), %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: andq %rbx, %r14 -; CHECK-NEXT: shrq %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; 
CHECK-NEXT: bswapq %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: shrq $4, %rbx +; CHECK-NEXT: andq %rbp, %rbx +; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: shlq $4, %rdi +; CHECK-NEXT: orq %rbx, %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: shrq $2, %rdi +; CHECK-NEXT: andq %r10, %rdi +; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: shrq %rdi +; CHECK-NEXT: andq %r14, %rdi +; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK-NEXT: bswapq %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: shrq $4, %r14 -; CHECK-NEXT: andq %rsi, %r14 -; CHECK-NEXT: andq %rsi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: orq %r14, %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: andq %rdi, %r14 -; CHECK-NEXT: shrq $2, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: leaq (%r10,%r14,4), %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: andq %rbx, %r14 -; CHECK-NEXT: shrq %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: bswapq %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: shrq $4, %rbx +; CHECK-NEXT: andq %rbp, %rbx +; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: shlq $4, %rdi +; CHECK-NEXT: orq %rbx, %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: andq %r10, %rbx +; CHECK-NEXT: shrq $2, %rdi +; CHECK-NEXT: andq %r10, %rdi +; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: shrq %rdi +; CHECK-NEXT: andq %r14, %rdi +; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK-NEXT: bswapq %r10 -; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: bswapq %rdi +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: orq %rax, %r10 -; CHECK-NEXT: movq %r10, %rax -; CHECK-NEXT: andq %rdi, %rax -; CHECK-NEXT: shrq $2, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: leaq (%r10,%rax,4), %rax -; CHECK-NEXT: movq %rax, %r10 -; CHECK-NEXT: andq %rbx, %r10 +; CHECK-NEXT: andq %rbp, %rax +; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: shlq $4, %rdi +; CHECK-NEXT: orq %rax, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: andq %r10, %rax +; CHECK-NEXT: shrq $2, %rdi +; CHECK-NEXT: andq %r10, %rdi +; CHECK-NEXT: leaq (%rdi,%rax,4), %rax +; CHECK-NEXT: movq %rax, %rdi +; CHECK-NEXT: andq %r14, %rdi ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andq %rbx, %rax -; CHECK-NEXT: leaq (%rax,%r10,2), %rax +; CHECK-NEXT: andq %r14, %rax +; CHECK-NEXT: leaq (%rax,%rdi,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %r9 ; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %r9 +; CHECK-NEXT: andq %rbp, %rax +; CHECK-NEXT: andq %rbp, %r9 ; CHECK-NEXT: shlq $4, %r9 ; CHECK-NEXT: orq %rax, %r9 ; CHECK-NEXT: movq %r9, %rax -; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: andq %r10, %rax ; CHECK-NEXT: shrq $2, %r9 -; CHECK-NEXT: andq %rdi, %r9 +; CHECK-NEXT: andq %r10, %r9 ; CHECK-NEXT: leaq (%r9,%rax,4), %rax ; CHECK-NEXT: movq %rax, %r9 -; CHECK-NEXT: andq %rbx, %r9 +; CHECK-NEXT: andq %r14, %r9 ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: 
andq %rbx, %rax +; CHECK-NEXT: andq %r14, %rax ; CHECK-NEXT: leaq (%rax,%r9,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %r8 ; CHECK-NEXT: movq %r8, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %r8 +; CHECK-NEXT: andq %rbp, %rax +; CHECK-NEXT: andq %rbp, %r8 ; CHECK-NEXT: shlq $4, %r8 ; CHECK-NEXT: orq %rax, %r8 ; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: andq %r10, %rax ; CHECK-NEXT: shrq $2, %r8 -; CHECK-NEXT: andq %rdi, %r8 +; CHECK-NEXT: andq %r10, %r8 ; CHECK-NEXT: leaq (%r8,%rax,4), %rax ; CHECK-NEXT: movq %rax, %r8 -; CHECK-NEXT: andq %rbx, %r8 +; CHECK-NEXT: andq %r14, %r8 ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andq %rbx, %rax -; CHECK-NEXT: leaq (%rax,%r8,2), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: andq %r14, %rax +; CHECK-NEXT: leaq (%rax,%r8,2), %r8 ; CHECK-NEXT: bswapq %rcx ; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %rcx +; CHECK-NEXT: andq %rbp, %rax +; CHECK-NEXT: andq %rbp, %rcx ; CHECK-NEXT: shlq $4, %rcx ; CHECK-NEXT: orq %rax, %rcx ; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: andq %r10, %rax ; CHECK-NEXT: shrq $2, %rcx -; CHECK-NEXT: andq %rdi, %rcx +; CHECK-NEXT: andq %r10, %rcx ; CHECK-NEXT: leaq (%rcx,%rax,4), %rax ; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: andq %rbx, %rcx +; CHECK-NEXT: andq %r14, %rcx ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: andq %r14, %rax ; CHECK-NEXT: leaq (%rax,%rcx,2), %r12 ; CHECK-NEXT: bswapq %rdx ; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %rdx +; CHECK-NEXT: andq %rbp, %rax +; CHECK-NEXT: andq %rbp, %rdx ; CHECK-NEXT: shlq $4, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %rax -; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: andq %r10, %rax ; CHECK-NEXT: shrq $2, %rdx -; CHECK-NEXT: andq %rdi, %rdx +; CHECK-NEXT: andq %r10, %rdx ; CHECK-NEXT: leaq (%rdx,%rax,4), %rax ; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: andq %rbx, %rdx +; CHECK-NEXT: andq %r14, %rdx ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andq %rbx, %rax +; CHECK-NEXT: andq %r14, %rax ; CHECK-NEXT: leaq (%rax,%rdx,2), %rdi ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-NEXT: bswapq %rax ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $4, %rcx -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rbp, %rcx +; CHECK-NEXT: andq %rbp, %rax ; CHECK-NEXT: shlq $4, %rax ; CHECK-NEXT: orq %rcx, %rax ; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: andq %rbp, %rcx +; CHECK-NEXT: andq %r10, %rcx ; CHECK-NEXT: shrq $2, %rax -; CHECK-NEXT: andq %rbp, %rax +; CHECK-NEXT: andq %r10, %rax ; CHECK-NEXT: leaq (%rax,%rcx,4), %rax -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: andq %rbx, %rsi +; CHECK-NEXT: movq %rax, %r10 +; CHECK-NEXT: andq %r14, %r10 ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andq %rbx, %rax -; CHECK-NEXT: leaq (%rax,%rsi,2), %rsi -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; CHECK-NEXT: andq %r14, %rax +; CHECK-NEXT: leaq (%rax,%r10,2), %rdx +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rax, %rdx +; CHECK-NEXT: shrdq $24, %rax, %rsi ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: shrdq 
$24, %rcx, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -337,14 +336,15 @@ ; CHECK-NEXT: shrdq $24, %r10, %r11 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; CHECK-NEXT: shrdq $24, %r9, %r10 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r8, %r9 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rax, %r8 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-NEXT: shrdq $24, %rcx, %r9 +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: shrdq $24, %r8, %rcx +; CHECK-NEXT: movq %rcx, %r8 ; CHECK-NEXT: shrdq $24, %r12, %rax ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrdq $24, %rdi, %r12 -; CHECK-NEXT: shrdq $24, %rsi, %rdi +; CHECK-NEXT: shrdq $24, %rdx, %rdi ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-NEXT: movq %rdi, 112(%rax) ; CHECK-NEXT: movq %r12, 104(%rax) @@ -362,10 +362,10 @@ ; CHECK-NEXT: movq %rcx, 16(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 8(%rax) -; CHECK-NEXT: movq %rdx, (%rax) -; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: shrq $56, %rsi -; CHECK-NEXT: movb %sil, 124(%rax) +; CHECK-NEXT: movq %rsi, (%rax) +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: shrq $56, %rdx +; CHECK-NEXT: movb %dl, 124(%rax) ; CHECK-NEXT: shrq $24, %rcx ; CHECK-NEXT: movl %ecx, 120(%rax) ; CHECK-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/pr45563-2.ll b/llvm/test/CodeGen/X86/pr45563-2.ll --- a/llvm/test/CodeGen/X86/pr45563-2.ll +++ b/llvm/test/CodeGen/X86/pr45563-2.ll @@ -203,8 +203,8 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: vmovd %esi, %xmm3 ; CHECK-NEXT: vpinsrb $2, %edx, %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3 @@ -218,7 +218,7 @@ ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 ; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; CHECK-NEXT: vmaskmovps (%r10), %ymm3, %ymm4 +; CHECK-NEXT: vmaskmovps (%rdi), %ymm3, %ymm4 ; CHECK-NEXT: vblendvps %ymm3, %ymm4, %ymm2, %ymm2 ; CHECK-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 @@ -233,12 +233,12 @@ ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 ; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; CHECK-NEXT: vmaskmovps 32(%r10), %ymm3, %ymm4 +; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm3, %ymm4 ; CHECK-NEXT: vblendvps %ymm3, %ymm4, %ymm1, %ymm1 -; CHECK-NEXT: vmovd %edi, %xmm3 +; CHECK-NEXT: vmovd %r10d, %xmm3 ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vmaskmovps 64(%r10), %ymm3, %ymm4 +; CHECK-NEXT: vmaskmovps 64(%rdi), %ymm3, %ymm4 ; CHECK-NEXT: vblendvps %xmm3, %xmm4, %xmm0, %xmm0 ; CHECK-NEXT: vmovss %xmm0, 64(%rax) ; CHECK-NEXT: vmovaps %ymm1, 32(%rax) @@ -276,8 +276,8 @@ ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; CHECK-NEXT: 
movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: vmovd %esi, %xmm4 ; CHECK-NEXT: vpinsrb $2, %edx, %xmm4, %xmm4 ; CHECK-NEXT: vpinsrb $4, %ecx, %xmm4, %xmm4 @@ -291,7 +291,7 @@ ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; CHECK-NEXT: vmaskmovps (%r10), %ymm4, %ymm5 +; CHECK-NEXT: vmaskmovps (%rdi), %ymm4, %ymm5 ; CHECK-NEXT: vblendvps %ymm4, %ymm5, %ymm3, %ymm3 ; CHECK-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 @@ -306,9 +306,9 @@ ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; CHECK-NEXT: vmaskmovps 32(%r10), %ymm4, %ymm5 +; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm4, %ymm5 ; CHECK-NEXT: vblendvps %ymm4, %ymm5, %ymm2, %ymm2 -; CHECK-NEXT: vmovd %edi, %xmm4 +; CHECK-NEXT: vmovd %r10d, %xmm4 ; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 ; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm4, %xmm4 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4 @@ -320,7 +320,7 @@ ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6 -; CHECK-NEXT: vmaskmovps 64(%r10), %ymm6, %ymm6 +; CHECK-NEXT: vmaskmovps 64(%rdi), %ymm6, %ymm6 ; CHECK-NEXT: vmovaps %ymm2, 32(%rax) ; CHECK-NEXT: vextractf128 $1, %ymm6, %xmm2 ; CHECK-NEXT: vblendvps %xmm4, %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/pr45563.ll b/llvm/test/CodeGen/X86/pr45563.ll --- a/llvm/test/CodeGen/X86/pr45563.ll +++ b/llvm/test/CodeGen/X86/pr45563.ll @@ -26,33 +26,33 @@ ; CHECK-NEXT: vmovdqa 128(%rbp), %xmm10 ; CHECK-NEXT: vpcmpgtq %xmm8, %xmm10, %xmm8 ; CHECK-NEXT: vpcmpgtq %xmm7, %xmm9, %xmm7 -; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm8 -; CHECK-NEXT: vextractf128 $1, %ymm6, %xmm10 +; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; CHECK-NEXT: vextractf128 $1, %ymm6, %xmm8 ; CHECK-NEXT: vmovdqa 80(%rbp), %xmm9 -; CHECK-NEXT: vmovdqa 96(%rbp), %xmm7 -; CHECK-NEXT: vpcmpgtq %xmm10, %xmm7, %xmm7 +; CHECK-NEXT: vmovdqa 96(%rbp), %xmm10 +; CHECK-NEXT: vpcmpgtq %xmm8, %xmm10, %xmm8 ; CHECK-NEXT: vpcmpgtq %xmm6, %xmm9, %xmm6 -; CHECK-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm10 -; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm7 +; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm8 ; CHECK-NEXT: vmovdqa 48(%rbp), %xmm9 -; CHECK-NEXT: vmovdqa 64(%rbp), %xmm6 -; CHECK-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6 +; CHECK-NEXT: vmovdqa 64(%rbp), %xmm10 +; CHECK-NEXT: vpcmpgtq %xmm8, %xmm10, %xmm8 ; CHECK-NEXT: vpcmpgtq %xmm5, %xmm9, %xmm5 -; CHECK-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; CHECK-NEXT: vextractf128 $1, %ymm4, %xmm6 +; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; CHECK-NEXT: vextractf128 $1, %ymm4, %xmm8 ; CHECK-NEXT: vmovdqa 16(%rbp), %xmm9 -; CHECK-NEXT: vmovdqa 32(%rbp), %xmm7 -; CHECK-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6 +; CHECK-NEXT: vmovdqa 32(%rbp), %xmm10 +; CHECK-NEXT: vpcmpgtq %xmm8, %xmm10, %xmm8 ; CHECK-NEXT: vpcmpgtq %xmm4, %xmm9, %xmm4 -; CHECK-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; CHECK-NEXT: vmaskmovpd (%rdi), %ymm4, %ymm6 -; CHECK-NEXT: vblendvpd %ymm4, %ymm6, %ymm0, %ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; CHECK-NEXT: vmaskmovpd (%rdi), %ymm4, %ymm8 +; CHECK-NEXT: vblendvpd %ymm4, %ymm8, 
%ymm0, %ymm0 ; CHECK-NEXT: vmaskmovpd 32(%rdi), %ymm5, %ymm4 ; CHECK-NEXT: vblendvpd %ymm5, %ymm4, %ymm1, %ymm1 -; CHECK-NEXT: vmaskmovpd 64(%rdi), %ymm10, %ymm4 -; CHECK-NEXT: vblendvpd %ymm10, %ymm4, %ymm2, %ymm2 -; CHECK-NEXT: vmaskmovpd 96(%rdi), %ymm8, %ymm4 -; CHECK-NEXT: vblendvpd %ymm8, %ymm4, %ymm3, %ymm3 +; CHECK-NEXT: vmaskmovpd 64(%rdi), %ymm6, %ymm4 +; CHECK-NEXT: vblendvpd %ymm6, %ymm4, %ymm2, %ymm2 +; CHECK-NEXT: vmaskmovpd 96(%rdi), %ymm7, %ymm4 +; CHECK-NEXT: vblendvpd %ymm7, %ymm4, %ymm3, %ymm3 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 diff --git a/llvm/test/CodeGen/X86/pr45995.ll b/llvm/test/CodeGen/X86/pr45995.ll --- a/llvm/test/CodeGen/X86/pr45995.ll +++ b/llvm/test/CodeGen/X86/pr45995.ll @@ -15,21 +15,21 @@ ; CHECK-NEXT: .cfi_offset rbp, -16 ; CHECK-NEXT: vpslld xmm0, xmm0, 31 ; CHECK-NEXT: vmovmskps edi, xmm0 +; CHECK-NEXT: mov ebx, edi +; CHECK-NEXT: shr bl, 3 ; CHECK-NEXT: mov ebp, edi -; CHECK-NEXT: shr bpl, 3 +; CHECK-NEXT: and bpl, 4 +; CHECK-NEXT: shr bpl, 2 ; CHECK-NEXT: mov r14d, edi -; CHECK-NEXT: and r14b, 4 -; CHECK-NEXT: shr r14b, 2 -; CHECK-NEXT: mov ebx, edi -; CHECK-NEXT: and bl, 2 -; CHECK-NEXT: shr bl -; CHECK-NEXT: call print_i1@PLT -; CHECK-NEXT: movzx edi, bl +; CHECK-NEXT: and r14b, 2 +; CHECK-NEXT: shr r14b ; CHECK-NEXT: call print_i1@PLT ; CHECK-NEXT: movzx edi, r14b ; CHECK-NEXT: call print_i1@PLT ; CHECK-NEXT: movzx edi, bpl ; CHECK-NEXT: call print_i1@PLT +; CHECK-NEXT: movzx edi, bl +; CHECK-NEXT: call print_i1@PLT ; CHECK-NEXT: pop rbx ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: pop r14 @@ -73,39 +73,39 @@ ; CHECK-NEXT: .cfi_offset r15, -24 ; CHECK-NEXT: .cfi_offset rbp, -16 ; CHECK-NEXT: vpslld xmm1, xmm1, 31 -; CHECK-NEXT: vmovmskps ebp, xmm1 -; CHECK-NEXT: mov eax, ebp +; CHECK-NEXT: vmovmskps ebx, xmm1 +; CHECK-NEXT: mov eax, ebx ; CHECK-NEXT: shr al, 3 ; CHECK-NEXT: mov byte ptr [rsp + 7], al # 1-byte Spill -; CHECK-NEXT: mov r15d, ebp -; CHECK-NEXT: and r15b, 4 -; CHECK-NEXT: shr r15b, 2 -; CHECK-NEXT: mov r13d, ebp -; CHECK-NEXT: and r13b, 2 -; CHECK-NEXT: shr r13b +; CHECK-NEXT: mov r14d, ebx +; CHECK-NEXT: and r14b, 4 +; CHECK-NEXT: shr r14b, 2 +; CHECK-NEXT: mov r15d, ebx +; CHECK-NEXT: and r15b, 2 +; CHECK-NEXT: shr r15b ; CHECK-NEXT: vpslld xmm0, xmm0, 31 ; CHECK-NEXT: vmovmskps edi, xmm0 ; CHECK-NEXT: mov r12d, edi ; CHECK-NEXT: shr r12b, 3 -; CHECK-NEXT: mov ebx, edi -; CHECK-NEXT: and bl, 4 -; CHECK-NEXT: shr bl, 2 -; CHECK-NEXT: mov r14d, edi -; CHECK-NEXT: and r14b, 2 -; CHECK-NEXT: shr r14b +; CHECK-NEXT: mov r13d, edi +; CHECK-NEXT: and r13b, 4 +; CHECK-NEXT: shr r13b, 2 +; CHECK-NEXT: mov ebp, edi +; CHECK-NEXT: and bpl, 2 +; CHECK-NEXT: shr bpl ; CHECK-NEXT: call print_i1@PLT -; CHECK-NEXT: movzx edi, r14b +; CHECK-NEXT: movzx edi, bpl ; CHECK-NEXT: call print_i1@PLT -; CHECK-NEXT: movzx edi, bl +; CHECK-NEXT: movzx edi, r13b ; CHECK-NEXT: call print_i1@PLT ; CHECK-NEXT: movzx edi, r12b ; CHECK-NEXT: call print_i1@PLT -; CHECK-NEXT: mov edi, ebp -; CHECK-NEXT: call print_i1@PLT -; CHECK-NEXT: movzx edi, r13b +; CHECK-NEXT: mov edi, ebx ; CHECK-NEXT: call print_i1@PLT ; CHECK-NEXT: movzx edi, r15b ; CHECK-NEXT: call print_i1@PLT +; CHECK-NEXT: movzx edi, r14b +; CHECK-NEXT: call print_i1@PLT ; CHECK-NEXT: movzx edi, byte ptr [rsp + 7] # 1-byte Folded Reload ; CHECK-NEXT: call print_i1@PLT ; CHECK-NEXT: add rsp, 8 diff --git a/llvm/test/CodeGen/X86/pr46877.ll b/llvm/test/CodeGen/X86/pr46877.ll --- a/llvm/test/CodeGen/X86/pr46877.ll +++ 
b/llvm/test/CodeGen/X86/pr46877.ll @@ -7,20 +7,20 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovaps %xmm3, %xmm15 ; CHECK-NEXT: vmovss {{.*#+}} xmm14 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero ; CHECK-NEXT: vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero ; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm12 -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm3 -; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm3 = (xmm15 * xmm3) - xmm0 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm10 +; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm10 = (xmm3 * xmm10) - xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm5, %xmm4, %xmm2 -; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm3 +; CHECK-NEXT: vmulss %xmm2, %xmm10, %xmm4 ; CHECK-NEXT: vmulss %xmm6, %xmm12, %xmm2 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm7 * xmm2) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm5 +; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm5 ; CHECK-NEXT: vmulss %xmm0, %xmm13, %xmm2 ; CHECK-NEXT: vmovss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vmulss %xmm2, %xmm10, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm9, %xmm2 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm3 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0 @@ -34,12 +34,13 @@ ; CHECK-NEXT: vmulss %xmm5, %xmm2, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm5 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm10 * xmm5) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm9 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm5, %xmm4, %xmm4 -; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss %xmm0, %xmm9, %xmm6 -; CHECK-NEXT: vmovss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vmulss %xmm6, %xmm14, %xmm5 +; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm8 +; CHECK-NEXT: vmovss %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: vmovaps %xmm5, %xmm10 +; CHECK-NEXT: vmulss %xmm14, %xmm8, %xmm5 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm5, %xmm2, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero @@ -66,7 +67,7 @@ ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm4 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm4 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 @@ -78,44 +79,43 @@ ; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm1 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm1 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm1 * xmm4) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm10 -; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm6 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm4 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm13, %xmm5 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm7 * xmm5) + xmm0 -; CHECK-NEXT: vmulss %xmm5, 
%xmm4, %xmm4 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm5 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm5 -; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm12 -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm7 * xmm5) + xmm0 +; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm4 +; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm5 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm10 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm10 = -(xmm10 * mem) + xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm13, %xmm12 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm12 = -(xmm7 * xmm12) + xmm0 +; CHECK-NEXT: vmulss %xmm12, %xmm10, %xmm10 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vmulss %xmm4, %xmm10, %xmm12 +; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm7 * xmm2) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss %xmm6, %xmm3, %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm10 * xmm2) + xmm0 +; CHECK-NEXT: vmulss %xmm5, %xmm3, %xmm6 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm9 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm1 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm5, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm5 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm6, %xmm2, %xmm2 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm6 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vmulss %xmm5, %xmm1, %xmm1 +; CHECK-NEXT: vmulss %xmm6, %xmm1, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm2 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm13 * xmm2) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm4 -; CHECK-NEXT: vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm3 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm13 * xmm3) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm3 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm12 * xmm3) + xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm2 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 ; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero @@ -127,11 +127,11 @@ ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Reload -; CHECK-NEXT: # xmm12 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2 +; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Reload +; CHECK-NEXT: # xmm10 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm2 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm7 = -(xmm7 * mem) + xmm0 -; CHECK-NEXT: 
vfnmadd213ss {{.*#+}} xmm2 = -(xmm13 * xmm2) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm12 * xmm2) + xmm0 ; CHECK-NEXT: vmulss %xmm7, %xmm2, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 @@ -142,63 +142,63 @@ ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm15 * xmm2) + xmm0 ; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm2 +; CHECK-NEXT: vmulss %xmm0, %xmm6, %xmm2 ; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm10 * xmm2) + xmm0 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm5 * xmm3) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm6 * xmm3) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm8 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm3 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm4 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm10 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm5 * xmm11) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm6 * xmm11) + xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm2 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm15 * xmm2) + xmm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm4 +; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm11, %xmm2 ; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2 ; CHECK-NEXT: vfnmadd132ss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 4-byte Folded Reload ; CHECK-NEXT: # xmm14 = -(xmm14 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm14, %xmm9 +; CHECK-NEXT: vmulss %xmm2, %xmm14, %xmm4 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm11 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm11 = -(xmm11 * mem) + xmm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm7 -; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 4-byte Folded Reload -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm1 -; CHECK-NEXT: vmulss %xmm6, %xmm15, %xmm6 -; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm6 = -(xmm3 * xmm6) + xmm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm4 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm3 * xmm4) + xmm0 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm7 = -(xmm3 * xmm7) + xmm0 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm3 * xmm5) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2 -; CHECK-NEXT: vmulss %xmm0, %xmm13, %xmm3 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss %xmm6, %xmm13, %xmm7 +; CHECK-NEXT: vmulss 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 4-byte Folded Reload +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm8 +; CHECK-NEXT: vmulss %xmm5, %xmm15, %xmm5 +; CHECK-NEXT: vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm11 * xmm5) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm9 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm9 = -(xmm11 * xmm9) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm7 = -(xmm11 * xmm7) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm6 = -(xmm11 * xmm6) + xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm10 +; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm11 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm11, %xmm11 ; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm12 * xmm3) + xmm0 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm12 * xmm2) + xmm0 -; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm15 * xmm1) - xmm0 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm8 = -(xmm8 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm8, %xmm9, %xmm0 -; CHECK-NEXT: vmulss %xmm6, %xmm0, %xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm12 * xmm11) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm12 * xmm10) + xmm0 +; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm8 = (xmm15 * xmm8) - xmm0 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm0 +; CHECK-NEXT: vmulss %xmm5, %xmm0, %xmm0 +; CHECK-NEXT: vmulss %xmm0, %xmm9, %xmm0 ; CHECK-NEXT: vmulss %xmm7, %xmm0, %xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm4 -; CHECK-NEXT: vmulss %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vmulss %xmm5, %xmm11, %xmm4 -; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmulss %xmm6, %xmm2, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm11, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm10, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm8, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovss %xmm0, (%rdi) ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr47299.ll b/llvm/test/CodeGen/X86/pr47299.ll --- a/llvm/test/CodeGen/X86/pr47299.ll +++ b/llvm/test/CodeGen/X86/pr47299.ll @@ -15,38 +15,38 @@ ; CHECK-NEXT: vpbroadcastq zmm0, rsi ; CHECK-NEXT: vpcmpnleuq k0, zmm0, zmmword ptr [rip + {{\.?LCPI[0-9]+_[0-9]+}}] ; CHECK-NEXT: kshiftrb k1, k0, 6 -; CHECK-NEXT: kmovd r8d, k1 +; CHECK-NEXT: kmovd ecx, k1 ; CHECK-NEXT: kshiftrb k1, k0, 5 -; CHECK-NEXT: kmovd r9d, k1 +; CHECK-NEXT: kmovd edx, k1 ; CHECK-NEXT: kshiftrb k1, k0, 4 -; CHECK-NEXT: kmovd r10d, k1 +; CHECK-NEXT: kmovd esi, k1 ; CHECK-NEXT: kshiftrb k1, k0, 3 ; CHECK-NEXT: kmovd edi, k1 ; CHECK-NEXT: kshiftrb k1, k0, 2 -; CHECK-NEXT: kmovd ecx, k1 +; CHECK-NEXT: kmovd r8d, k1 ; CHECK-NEXT: kshiftrb k1, k0, 1 -; CHECK-NEXT: kmovd edx, k1 -; CHECK-NEXT: kmovd esi, k0 -; CHECK-NEXT: and sil, 1 -; CHECK-NEXT: and dl, 1 -; CHECK-NEXT: add dl, dl -; CHECK-NEXT: or dl, sil -; CHECK-NEXT: and cl, 1 -; CHECK-NEXT: shl cl, 2 -; CHECK-NEXT: or cl, dl -; CHECK-NEXT: and dil, 1 -; CHECK-NEXT: shl dil, 3 -; CHECK-NEXT: or dil, cl +; CHECK-NEXT: kmovd r9d, k1 +; CHECK-NEXT: kmovd r10d, k0 ; CHECK-NEXT: and r10b, 1 -; CHECK-NEXT: shl r10b, 4 -; CHECK-NEXT: or r10b, dil ; CHECK-NEXT: and r9b, 1 -; 
CHECK-NEXT: shl r9b, 5 +; CHECK-NEXT: add r9b, r9b ; CHECK-NEXT: or r9b, r10b -; CHECK-NEXT: shl r8b, 6 +; CHECK-NEXT: and r8b, 1 +; CHECK-NEXT: shl r8b, 2 ; CHECK-NEXT: or r8b, r9b -; CHECK-NEXT: and r8b, 127 -; CHECK-NEXT: mov byte ptr [rax], r8b +; CHECK-NEXT: and dil, 1 +; CHECK-NEXT: shl dil, 3 +; CHECK-NEXT: or dil, r8b +; CHECK-NEXT: and sil, 1 +; CHECK-NEXT: shl sil, 4 +; CHECK-NEXT: or sil, dil +; CHECK-NEXT: and dl, 1 +; CHECK-NEXT: shl dl, 5 +; CHECK-NEXT: or dl, sil +; CHECK-NEXT: shl cl, 6 +; CHECK-NEXT: or cl, dl +; CHECK-NEXT: and cl, 127 +; CHECK-NEXT: mov byte ptr [rax], cl ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret %2 = call <7 x i1> @llvm.get.active.lane.mask.v7i1.i64(i64 0, i64 %0) diff --git a/llvm/test/CodeGen/X86/pr47857.ll b/llvm/test/CodeGen/X86/pr47857.ll --- a/llvm/test/CodeGen/X86/pr47857.ll +++ b/llvm/test/CodeGen/X86/pr47857.ll @@ -8,29 +8,29 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rdx), %r9 -; CHECK-NEXT: movq 8(%rdx), %r8 +; CHECK-NEXT: movq 8(%rdx), %rcx ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: addq (%rsi), %r9 -; CHECK-NEXT: adcq 8(%rsi), %r8 -; CHECK-NEXT: movq 16(%rdx), %rcx -; CHECK-NEXT: adcq 16(%rsi), %rcx +; CHECK-NEXT: adcq 8(%rsi), %rcx +; CHECK-NEXT: movq 16(%rdx), %r8 +; CHECK-NEXT: adcq 16(%rsi), %r8 ; CHECK-NEXT: movq 24(%rdx), %rdx ; CHECK-NEXT: adcq 24(%rsi), %rdx ; CHECK-NEXT: sbbq %rdi, %rdi ; CHECK-NEXT: andl $38, %edi ; CHECK-NEXT: addq %rdi, %r9 -; CHECK-NEXT: adcq $0, %r8 ; CHECK-NEXT: adcq $0, %rcx +; CHECK-NEXT: adcq $0, %r8 ; CHECK-NEXT: adcq $0, %rdx ; CHECK-NEXT: sbbq %rdi, %rdi ; CHECK-NEXT: andl $38, %edi ; CHECK-NEXT: addq %r9, %rdi -; CHECK-NEXT: adcq $0, %r8 ; CHECK-NEXT: adcq $0, %rcx +; CHECK-NEXT: adcq $0, %r8 ; CHECK-NEXT: adcq $0, %rdx ; CHECK-NEXT: movq %rdi, (%rax) -; CHECK-NEXT: movq %r8, 8(%rax) -; CHECK-NEXT: movq %rcx, 16(%rax) +; CHECK-NEXT: movq %rcx, 8(%rax) +; CHECK-NEXT: movq %r8, 16(%rax) ; CHECK-NEXT: movq %rdx, 24(%rax) ; CHECK-NEXT: retq %4 = load i64, ptr %1, align 8 diff --git a/llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll b/llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll --- a/llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll +++ b/llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll @@ -10,17 +10,17 @@ ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq %rdx, %rbx -; CHECK-NEXT: movl %edi, %r14d -; CHECK-NEXT: movq (%rsi), %rbp +; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movq (%rsi), %r14 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: jmpq *.LJTI0_0(,%rax,8) ; CHECK-NEXT: .LBB0_1: # %split.3 -; CHECK-NEXT: testb $1, %r14b +; CHECK-NEXT: testb $1, %bpl ; CHECK-NEXT: je .LBB0_3 ; CHECK-NEXT: # %bb.2: # %clobber ; CHECK-NEXT: callq clobber@PLT ; CHECK-NEXT: .LBB0_3: # %sink -; CHECK-NEXT: movq %rbp, (%rbx) +; CHECK-NEXT: movq %r14, (%rbx) ; CHECK-NEXT: .LBB0_4: # %latch ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll --- a/llvm/test/CodeGen/X86/promote-cmp.ll +++ b/llvm/test/CodeGen/X86/promote-cmp.ll @@ -8,34 +8,34 @@ ; SSE2-LABEL: PR45808: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm3, %xmm9 -; SSE2-NEXT: pxor %xmm4, %xmm9 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm7 
-; SSE2-NEXT: pxor %xmm4, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pxor %xmm4, %xmm8 ; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm7[0,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm6[1,3] ; SSE2-NEXT: andps %xmm10, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm8[1,3] -; SSE2-NEXT: orps %xmm4, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm7[1,3] +; SSE2-NEXT: orps %xmm4, %xmm9 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,1,3,3] ; SSE2-NEXT: psllq $63, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -820,41 +820,41 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; SSE2OR3-LABEL: test14: ; SSE2OR3: # %bb.0: # %vector.ph -; SSE2OR3-NEXT: pxor %xmm8, %xmm8 +; SSE2OR3-NEXT: pxor %xmm5, %xmm5 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm6 -; SSE2OR3-NEXT: movdqa %xmm4, %xmm9 -; SSE2OR3-NEXT: movdqa %xmm3, %xmm10 -; SSE2OR3-NEXT: movdqa %xmm2, %xmm7 -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2OR3-NEXT: pand %xmm5, %xmm4 -; SSE2OR3-NEXT: pand %xmm5, %xmm3 +; SSE2OR3-NEXT: movdqa %xmm4, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 +; SSE2OR3-NEXT: movdqa %xmm2, %xmm9 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2OR3-NEXT: pand %xmm10, %xmm4 +; SSE2OR3-NEXT: pand %xmm10, %xmm3 ; SSE2OR3-NEXT: packuswb %xmm4, %xmm3 ; SSE2OR3-NEXT: movdqa %xmm1, %xmm4 -; SSE2OR3-NEXT: pand %xmm5, %xmm2 -; SSE2OR3-NEXT: pand %xmm5, %xmm1 +; SSE2OR3-NEXT: pand %xmm10, %xmm2 +; SSE2OR3-NEXT: pand %xmm10, %xmm1 ; SSE2OR3-NEXT: packuswb %xmm2, %xmm1 ; SSE2OR3-NEXT: packuswb %xmm3, %xmm1 ; SSE2OR3-NEXT: psubb %xmm0, %xmm1 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 -; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2OR3-NEXT: movdqa %xmm2, %xmm0 -; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm6 = 
xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; SSE2OR3-NEXT: movdqa %xmm6, %xmm3 -; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2OR3-NEXT: pxor %xmm5, %xmm9 +; SSE2OR3-NEXT: pxor %xmm5, %xmm7 ; SSE2OR3-NEXT: por %xmm5, %xmm6 -; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm6 -; SSE2OR3-NEXT: pxor %xmm5, %xmm10 +; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2OR3-NEXT: pxor %xmm5, %xmm8 ; SSE2OR3-NEXT: por %xmm5, %xmm3 -; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm3 ; SSE2OR3-NEXT: packssdw %xmm6, %xmm3 -; SSE2OR3-NEXT: pxor %xmm5, %xmm7 +; SSE2OR3-NEXT: pxor %xmm5, %xmm9 ; SSE2OR3-NEXT: por %xmm5, %xmm2 -; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm2 +; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm2 ; SSE2OR3-NEXT: pxor %xmm5, %xmm4 ; SSE2OR3-NEXT: por %xmm5, %xmm0 ; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm0 @@ -866,27 +866,27 @@ ; SSE41-LABEL: test14: ; SSE41: # %bb.0: # %vector.ph ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; SSE41-NEXT: pmaxud %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero +; SSE41-NEXT: pmaxud %xmm4, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE41-NEXT: pxor %xmm9, %xmm6 +; SSE41-NEXT: pxor %xmm9, %xmm8 ; SSE41-NEXT: pmaxud %xmm3, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 ; SSE41-NEXT: pxor %xmm9, %xmm7 -; SSE41-NEXT: packssdw %xmm6, %xmm7 +; SSE41-NEXT: packssdw %xmm8, %xmm7 ; SSE41-NEXT: pmaxud %xmm1, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm5 ; SSE41-NEXT: pxor %xmm9, %xmm5 -; SSE41-NEXT: pmaxud %xmm2, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 -; SSE41-NEXT: pxor %xmm9, %xmm8 -; SSE41-NEXT: packssdw %xmm8, %xmm5 +; SSE41-NEXT: pmaxud %xmm2, %xmm6 +; SSE41-NEXT: 
pcmpeqd %xmm2, %xmm6 +; SSE41-NEXT: pxor %xmm9, %xmm6 +; SSE41-NEXT: packssdw %xmm6, %xmm5 ; SSE41-NEXT: packsswb %xmm7, %xmm5 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE41-NEXT: pand %xmm6, %xmm4 @@ -1608,66 +1608,66 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE2OR3-LABEL: psubus_8i64_max: ; SSE2OR3: # %bb.0: # %vector.ph -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE2OR3-NEXT: movdqa %xmm2, %xmm7 -; SSE2OR3-NEXT: pxor %xmm8, %xmm7 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991] -; SSE2OR3-NEXT: movdqa %xmm9, %xmm6 -; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2OR3-NEXT: pand %xmm5, %xmm7 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2OR3-NEXT: por %xmm7, %xmm5 -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535] -; SSE2OR3-NEXT: pand %xmm5, %xmm2 -; SSE2OR3-NEXT: pandn %xmm10, %xmm5 -; SSE2OR3-NEXT: por %xmm2, %xmm5 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] +; SSE2OR3-NEXT: pxor %xmm5, %xmm7 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] +; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2] +; SSE2OR3-NEXT: pand %xmm8, %xmm7 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3] +; SSE2OR3-NEXT: por %xmm7, %xmm8 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535] +; SSE2OR3-NEXT: pand %xmm8, %xmm2 +; SSE2OR3-NEXT: pandn %xmm7, %xmm8 +; SSE2OR3-NEXT: por %xmm2, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE2OR3-NEXT: movdqa %xmm1, %xmm5 -; SSE2OR3-NEXT: pxor %xmm8, %xmm5 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2OR3-NEXT: movdqa %xmm9, %xmm7 -; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] -; SSE2OR3-NEXT: pand %xmm6, %xmm5 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2OR3-NEXT: por %xmm5, %xmm6 -; SSE2OR3-NEXT: pand %xmm6, %xmm1 -; SSE2OR3-NEXT: pandn %xmm10, %xmm6 -; SSE2OR3-NEXT: por %xmm1, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] +; SSE2OR3-NEXT: movdqa %xmm1, %xmm8 +; SSE2OR3-NEXT: pxor %xmm5, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm9 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm10 +; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2] +; SSE2OR3-NEXT: pand %xmm9, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2OR3-NEXT: por %xmm8, %xmm9 +; SSE2OR3-NEXT: pand %xmm9, %xmm1 +; SSE2OR3-NEXT: pandn %xmm7, %xmm9 +; SSE2OR3-NEXT: por %xmm1, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2OR3-NEXT: movdqa %xmm4, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2OR3-NEXT: movdqa %xmm9, %xmm6 -; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm6 -; 
SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; SSE2OR3-NEXT: pand %xmm5, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2OR3-NEXT: por %xmm2, %xmm5 -; SSE2OR3-NEXT: pand %xmm5, %xmm4 -; SSE2OR3-NEXT: pandn %xmm10, %xmm5 -; SSE2OR3-NEXT: por %xmm4, %xmm5 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] +; SSE2OR3-NEXT: pxor %xmm5, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] +; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2] +; SSE2OR3-NEXT: pand %xmm8, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3] +; SSE2OR3-NEXT: por %xmm2, %xmm8 +; SSE2OR3-NEXT: pand %xmm8, %xmm4 +; SSE2OR3-NEXT: pandn %xmm7, %xmm8 +; SSE2OR3-NEXT: por %xmm4, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] ; SSE2OR3-NEXT: movdqa %xmm3, %xmm4 -; SSE2OR3-NEXT: pxor %xmm8, %xmm4 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,2,2] -; SSE2OR3-NEXT: pand %xmm5, %xmm4 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2OR3-NEXT: pxor %xmm5, %xmm4 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] +; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; SSE2OR3-NEXT: pand %xmm8, %xmm4 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2OR3-NEXT: por %xmm4, %xmm5 ; SSE2OR3-NEXT: pand %xmm5, %xmm3 -; SSE2OR3-NEXT: pandn %xmm10, %xmm5 +; SSE2OR3-NEXT: pandn %xmm7, %xmm5 ; SSE2OR3-NEXT: por %xmm3, %xmm5 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] @@ -1678,56 +1678,56 @@ ; ; SSE41-LABEL: psubus_8i64_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm6, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm5, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm6 = [65535,65535] -; SSE41-NEXT: movapd %xmm6, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535] +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm5, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pcmpgtd 
%xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm6, %xmm4 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 -; SSE41-NEXT: packusdw %xmm10, %xmm4 +; SSE41-NEXT: packusdw %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm5, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm6, %xmm3 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pxor %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: packusdw %xmm3, %xmm6 -; SSE41-NEXT: packusdw %xmm4, %xmm6 -; SSE41-NEXT: psubusw %xmm6, %xmm8 -; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: packusdw %xmm3, %xmm7 +; SSE41-NEXT: packusdw %xmm4, %xmm7 +; SSE41-NEXT: psubusw %xmm7, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: psubus_8i64_max: @@ -1793,47 +1793,47 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; SSE2OR3-LABEL: psubus_16i32_max: ; SSE2OR3: # %bb.0: # %vector.ph -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] ; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 -; SSE2OR3-NEXT: pxor %xmm9, %xmm8 -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183] -; SSE2OR3-NEXT: movdqa %xmm10, %xmm6 -; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6 +; SSE2OR3-NEXT: pxor %xmm7, %xmm8 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] +; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 ; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2OR3-NEXT: pand %xmm6, %xmm3 -; SSE2OR3-NEXT: pxor %xmm8, %xmm6 -; SSE2OR3-NEXT: por %xmm3, %xmm6 -; SSE2OR3-NEXT: pslld $16, %xmm6 -; SSE2OR3-NEXT: psrad $16, %xmm6 +; SSE2OR3-NEXT: pand %xmm9, %xmm3 +; SSE2OR3-NEXT: pxor %xmm8, %xmm9 +; SSE2OR3-NEXT: por %xmm3, %xmm9 +; SSE2OR3-NEXT: pslld $16, %xmm9 +; SSE2OR3-NEXT: psrad $16, %xmm9 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 -; SSE2OR3-NEXT: pxor %xmm9, %xmm3 -; SSE2OR3-NEXT: movdqa %xmm10, %xmm7 -; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2OR3-NEXT: pand %xmm7, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm7 -; SSE2OR3-NEXT: por %xmm2, %xmm7 -; SSE2OR3-NEXT: pslld $16, %xmm7 -; SSE2OR3-NEXT: psrad $16, %xmm7 -; SSE2OR3-NEXT: packssdw %xmm6, %xmm7 -; SSE2OR3-NEXT: psubusw %xmm7, %xmm0 +; SSE2OR3-NEXT: pxor %xmm7, %xmm3 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm10 +; 
SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm10 +; SSE2OR3-NEXT: pand %xmm10, %xmm2 +; SSE2OR3-NEXT: pxor %xmm8, %xmm10 +; SSE2OR3-NEXT: por %xmm2, %xmm10 +; SSE2OR3-NEXT: pslld $16, %xmm10 +; SSE2OR3-NEXT: psrad $16, %xmm10 +; SSE2OR3-NEXT: packssdw %xmm9, %xmm10 +; SSE2OR3-NEXT: psubusw %xmm10, %xmm0 ; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 -; SSE2OR3-NEXT: pxor %xmm9, %xmm2 -; SSE2OR3-NEXT: movdqa %xmm10, %xmm3 +; SSE2OR3-NEXT: pxor %xmm7, %xmm2 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm3 ; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3 ; SSE2OR3-NEXT: pand %xmm3, %xmm5 ; SSE2OR3-NEXT: pxor %xmm8, %xmm3 ; SSE2OR3-NEXT: por %xmm5, %xmm3 ; SSE2OR3-NEXT: pslld $16, %xmm3 ; SSE2OR3-NEXT: psrad $16, %xmm3 -; SSE2OR3-NEXT: pxor %xmm4, %xmm9 -; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2OR3-NEXT: pxor %xmm10, %xmm8 -; SSE2OR3-NEXT: pand %xmm4, %xmm10 -; SSE2OR3-NEXT: por %xmm8, %xmm10 -; SSE2OR3-NEXT: pslld $16, %xmm10 -; SSE2OR3-NEXT: psrad $16, %xmm10 -; SSE2OR3-NEXT: packssdw %xmm3, %xmm10 -; SSE2OR3-NEXT: psubusw %xmm10, %xmm1 +; SSE2OR3-NEXT: pxor %xmm4, %xmm7 +; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2OR3-NEXT: pxor %xmm6, %xmm8 +; SSE2OR3-NEXT: pand %xmm4, %xmm6 +; SSE2OR3-NEXT: por %xmm8, %xmm6 +; SSE2OR3-NEXT: pslld $16, %xmm6 +; SSE2OR3-NEXT: psrad $16, %xmm6 +; SSE2OR3-NEXT: packssdw %xmm3, %xmm6 +; SSE2OR3-NEXT: psubusw %xmm6, %xmm1 ; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: psubus_16i32_max: @@ -2672,130 +2672,130 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) { ; SSE2OR3-LABEL: test33: ; SSE2OR3: # %bb.0: -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] -; SSE2OR3-NEXT: movdqa %xmm3, %xmm6 -; SSE2OR3-NEXT: pxor %xmm8, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] -; SSE2OR3-NEXT: movdqa %xmm9, %xmm7 -; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2OR3-NEXT: pand %xmm10, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2OR3-NEXT: por %xmm6, %xmm7 -; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE2OR3-NEXT: pand %xmm7, %xmm3 -; SSE2OR3-NEXT: pxor %xmm10, %xmm7 -; SSE2OR3-NEXT: por %xmm3, %xmm7 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 +; SSE2OR3-NEXT: pxor %xmm6, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455] +; SSE2OR3-NEXT: movdqa %xmm7, %xmm10 +; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2] +; SSE2OR3-NEXT: pand %xmm9, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2OR3-NEXT: por %xmm8, %xmm9 +; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE2OR3-NEXT: pand %xmm9, %xmm3 +; SSE2OR3-NEXT: pxor %xmm8, %xmm9 +; SSE2OR3-NEXT: por %xmm3, %xmm9 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 -; SSE2OR3-NEXT: pxor %xmm8, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm11 -; SSE2OR3-NEXT: movdqa %xmm9, %xmm6 -; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] -; SSE2OR3-NEXT: pand %xmm11, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2OR3-NEXT: por %xmm3, %xmm6 -; SSE2OR3-NEXT: pand %xmm6, %xmm2 -; SSE2OR3-NEXT: pxor %xmm10, %xmm6 -; SSE2OR3-NEXT: por %xmm2, %xmm6 -; SSE2OR3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] +; 
SSE2OR3-NEXT: pxor %xmm6, %xmm3 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm10 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm11 +; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2] +; SSE2OR3-NEXT: pand %xmm10, %xmm3 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2OR3-NEXT: por %xmm3, %xmm10 +; SSE2OR3-NEXT: pand %xmm10, %xmm2 +; SSE2OR3-NEXT: pxor %xmm8, %xmm10 +; SSE2OR3-NEXT: por %xmm2, %xmm10 +; SSE2OR3-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,2] ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 -; SSE2OR3-NEXT: psubd %xmm6, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm6 -; SSE2OR3-NEXT: pxor %xmm8, %xmm0 -; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2OR3-NEXT: psubd %xmm10, %xmm2 +; SSE2OR3-NEXT: pxor %xmm6, %xmm10 +; SSE2OR3-NEXT: pxor %xmm6, %xmm0 +; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE2OR3-NEXT: pand %xmm2, %xmm0 ; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm2 +; SSE2OR3-NEXT: pxor %xmm6, %xmm2 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2OR3-NEXT: movdqa %xmm9, %xmm6 -; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2] ; SSE2OR3-NEXT: pand %xmm3, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] ; SSE2OR3-NEXT: por %xmm2, %xmm3 ; SSE2OR3-NEXT: pand %xmm3, %xmm5 -; SSE2OR3-NEXT: pxor %xmm10, %xmm3 +; SSE2OR3-NEXT: pxor %xmm8, %xmm3 ; SSE2OR3-NEXT: por %xmm5, %xmm3 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm2 +; SSE2OR3-NEXT: pxor %xmm6, %xmm2 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] ; SSE2OR3-NEXT: pand %xmm5, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; SSE2OR3-NEXT: por %xmm2, %xmm5 -; SSE2OR3-NEXT: pxor %xmm5, %xmm10 +; SSE2OR3-NEXT: pxor %xmm5, %xmm8 ; SSE2OR3-NEXT: pand %xmm4, %xmm5 -; SSE2OR3-NEXT: por %xmm10, %xmm5 +; SSE2OR3-NEXT: por %xmm8, %xmm5 ; SSE2OR3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2] ; SSE2OR3-NEXT: movdqa %xmm1, %xmm2 ; SSE2OR3-NEXT: psubd %xmm5, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm5 -; SSE2OR3-NEXT: pxor %xmm8, %xmm1 +; SSE2OR3-NEXT: pxor %xmm6, %xmm5 +; SSE2OR3-NEXT: pxor %xmm6, %xmm1 ; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm1 ; SSE2OR3-NEXT: pand %xmm2, %xmm1 ; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: test33: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm0, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm11, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: movdqa %xmm11, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295] -; SSE41-NEXT: movapd %xmm7, %xmm10 +; 
SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE41-NEXT: movapd %xmm8, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm11, %xmm3 +; SSE41-NEXT: movdqa %xmm7, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm11, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: movdqa %xmm7, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm3 +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2] -; SSE41-NEXT: pmaxud %xmm3, %xmm8 -; SSE41-NEXT: psubd %xmm3, %xmm8 +; SSE41-NEXT: pmaxud %xmm3, %xmm6 +; SSE41-NEXT: psubd %xmm3, %xmm6 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm11, %xmm2 +; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm11, %xmm3 +; SSE41-NEXT: movdqa %xmm7, %xmm3 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm2 +; SSE41-NEXT: movapd %xmm8, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 ; SSE41-NEXT: pxor %xmm4, %xmm9 -; SSE41-NEXT: movdqa %xmm11, %xmm3 +; SSE41-NEXT: movdqa %xmm7, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm9, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm11, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 -; SSE41-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2] -; SSE41-NEXT: pmaxud %xmm7, %xmm1 -; SSE41-NEXT: psubd %xmm7, %xmm1 -; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm2[0,2] +; SSE41-NEXT: pmaxud %xmm8, %xmm1 +; SSE41-NEXT: psubd %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test33: @@ -2903,133 +2903,133 @@ ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1] ; SSE2OR3-NEXT: pand %xmm6, %xmm1 ; SSE2OR3-NEXT: pand %xmm6, %xmm0 -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] -; SSE2OR3-NEXT: movdqa %xmm3, %xmm6 -; SSE2OR3-NEXT: pxor %xmm8, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] -; SSE2OR3-NEXT: movdqa %xmm9, %xmm7 -; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2OR3-NEXT: pand %xmm10, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2OR3-NEXT: por %xmm6, %xmm7 -; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE2OR3-NEXT: pand %xmm7, %xmm3 -; SSE2OR3-NEXT: pxor %xmm10, %xmm7 -; SSE2OR3-NEXT: por %xmm3, %xmm7 
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 +; SSE2OR3-NEXT: pxor %xmm6, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455] +; SSE2OR3-NEXT: movdqa %xmm7, %xmm10 +; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2] +; SSE2OR3-NEXT: pand %xmm9, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2OR3-NEXT: por %xmm8, %xmm9 +; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE2OR3-NEXT: pand %xmm9, %xmm3 +; SSE2OR3-NEXT: pxor %xmm8, %xmm9 +; SSE2OR3-NEXT: por %xmm3, %xmm9 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 -; SSE2OR3-NEXT: pxor %xmm8, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm11 -; SSE2OR3-NEXT: movdqa %xmm9, %xmm6 -; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] -; SSE2OR3-NEXT: pand %xmm11, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2OR3-NEXT: por %xmm3, %xmm6 -; SSE2OR3-NEXT: pand %xmm6, %xmm2 -; SSE2OR3-NEXT: pxor %xmm10, %xmm6 -; SSE2OR3-NEXT: por %xmm2, %xmm6 -; SSE2OR3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] +; SSE2OR3-NEXT: pxor %xmm6, %xmm3 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm10 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm11 +; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2] +; SSE2OR3-NEXT: pand %xmm10, %xmm3 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2OR3-NEXT: por %xmm3, %xmm10 +; SSE2OR3-NEXT: pand %xmm10, %xmm2 +; SSE2OR3-NEXT: pxor %xmm8, %xmm10 +; SSE2OR3-NEXT: por %xmm2, %xmm10 +; SSE2OR3-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,2] ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 -; SSE2OR3-NEXT: psubd %xmm6, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm6 -; SSE2OR3-NEXT: por %xmm8, %xmm0 -; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2OR3-NEXT: psubd %xmm10, %xmm2 +; SSE2OR3-NEXT: pxor %xmm6, %xmm10 +; SSE2OR3-NEXT: por %xmm6, %xmm0 +; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE2OR3-NEXT: pand %xmm2, %xmm0 ; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm2 +; SSE2OR3-NEXT: pxor %xmm6, %xmm2 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2OR3-NEXT: movdqa %xmm9, %xmm6 -; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2] ; SSE2OR3-NEXT: pand %xmm3, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] ; SSE2OR3-NEXT: por %xmm2, %xmm3 ; SSE2OR3-NEXT: pand %xmm3, %xmm5 -; SSE2OR3-NEXT: pxor %xmm10, %xmm3 +; SSE2OR3-NEXT: pxor %xmm8, %xmm3 ; SSE2OR3-NEXT: por %xmm5, %xmm3 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm2 +; SSE2OR3-NEXT: pxor %xmm6, %xmm2 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] ; SSE2OR3-NEXT: pand %xmm5, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2OR3-NEXT: 
pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; SSE2OR3-NEXT: por %xmm2, %xmm5 -; SSE2OR3-NEXT: pxor %xmm5, %xmm10 +; SSE2OR3-NEXT: pxor %xmm5, %xmm8 ; SSE2OR3-NEXT: pand %xmm4, %xmm5 -; SSE2OR3-NEXT: por %xmm10, %xmm5 +; SSE2OR3-NEXT: por %xmm8, %xmm5 ; SSE2OR3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2] ; SSE2OR3-NEXT: movdqa %xmm1, %xmm2 ; SSE2OR3-NEXT: psubd %xmm5, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm5 -; SSE2OR3-NEXT: por %xmm8, %xmm1 +; SSE2OR3-NEXT: pxor %xmm6, %xmm5 +; SSE2OR3-NEXT: por %xmm6, %xmm1 ; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm1 ; SSE2OR3-NEXT: pand %xmm2, %xmm1 ; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: test34: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm0, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm8 +; SSE41-NEXT: pand %xmm0, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm11, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: movdqa %xmm11, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295] -; SSE41-NEXT: movapd %xmm7, %xmm10 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE41-NEXT: movapd %xmm8, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm11, %xmm3 +; SSE41-NEXT: movdqa %xmm7, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm11, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: movdqa %xmm7, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm3 +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2] -; SSE41-NEXT: pmaxud %xmm3, %xmm8 -; SSE41-NEXT: psubd %xmm3, %xmm8 +; SSE41-NEXT: pmaxud %xmm3, %xmm6 +; SSE41-NEXT: psubd %xmm3, %xmm6 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm11, %xmm2 +; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm11, %xmm3 +; SSE41-NEXT: movdqa %xmm7, %xmm3 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm2 +; SSE41-NEXT: movapd %xmm8, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 ; SSE41-NEXT: pxor %xmm4, %xmm9 -; SSE41-NEXT: movdqa %xmm11, %xmm3 +; SSE41-NEXT: movdqa %xmm7, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm9, %xmm7 +; SSE41-NEXT: pshufd 
{{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm11, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 -; SSE41-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2] -; SSE41-NEXT: pmaxud %xmm7, %xmm1 -; SSE41-NEXT: psubd %xmm7, %xmm1 -; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm2[0,2] +; SSE41-NEXT: pmaxud %xmm8, %xmm1 +; SSE41-NEXT: psubd %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test34: diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll --- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll +++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll @@ -65,8 +65,8 @@ ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: je LBB0_54 ; CHECK-NEXT: ## %bb.6: ## %SyTime.exit2720 -; CHECK-NEXT: movq %rdx, %rbx -; CHECK-NEXT: movq %rdi, %rbp +; CHECK-NEXT: movq %rdx, %r14 +; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: cmpq %rax, %rcx @@ -76,10 +76,10 @@ ; CHECK-NEXT: movl $32, %esi ; CHECK-NEXT: callq _memset ; CHECK-NEXT: LBB0_8: ## %while.body.preheader -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: imulq $1040, %rbx, %rax ## imm = 0x410 +; CHECK-NEXT: imulq $1040, %r14, %rax ## imm = 0x410 ; CHECK-NEXT: movq _syBuf@GOTPCREL(%rip), %rcx -; CHECK-NEXT: leaq 8(%rcx,%rax), %rdx +; CHECK-NEXT: leaq 8(%rcx,%rax), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl $1, %r15d ; CHECK-NEXT: movq _syCTRO@GOTPCREL(%rip), %rax ; CHECK-NEXT: movb $1, %cl @@ -90,14 +90,13 @@ ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: jne LBB0_9 ; CHECK-NEXT: ## %bb.10: ## %do.end -; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: xorl %r14d, %r14d -; CHECK-NEXT: testb %r14b, %r14b +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: jne LBB0_11 ; CHECK-NEXT: ## %bb.12: ## %while.body200.preheader ; CHECK-NEXT: xorl %r13d, %r13d ; CHECK-NEXT: leaq LJTI0_0(%rip), %rdx -; CHECK-NEXT: leaq LJTI0_1(%rip), %rbx +; CHECK-NEXT: leaq LJTI0_1(%rip), %r14 ; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-NEXT: xorl %r12d, %r12d ; CHECK-NEXT: jmp LBB0_13 @@ -110,19 +109,19 @@ ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 ; CHECK-NEXT: decl %r15d ; CHECK-NEXT: testl %r15d, %r15d -; CHECK-NEXT: movl %r14d, %r12d +; CHECK-NEXT: movl %ebx, %r12d ; CHECK-NEXT: jle LBB0_21 ; CHECK-NEXT: LBB0_13: ## %while.body200 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB0_28 Depth 2 ; CHECK-NEXT: ## Child Loop BB0_37 Depth 2 -; CHECK-NEXT: leal -268(%r14), %eax +; CHECK-NEXT: leal -268(%rbx), %eax ; CHECK-NEXT: cmpl $105, %eax ; CHECK-NEXT: ja LBB0_14 ; CHECK-NEXT: ## %bb.55: ## %while.body200 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: movslq (%rbx,%rax,4), %rax -; CHECK-NEXT: addq %rbx, %rax +; CHECK-NEXT: movslq (%r14,%rax,4), %rax +; CHECK-NEXT: addq %r14, %rax ; CHECK-NEXT: jmpq *%rax ; CHECK-NEXT: LBB0_25: ## %sw.bb474 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 @@ -164,7 +163,7 @@ ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_14: ## %while.body200 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: leal 1(%r14), %eax +; CHECK-NEXT: leal 1(%rbx), %eax ; CHECK-NEXT: cmpl $21, %eax ; 
CHECK-NEXT: ja LBB0_20 ; CHECK-NEXT: ## %bb.15: ## %while.body200 @@ -174,7 +173,7 @@ ; CHECK-NEXT: jmpq *%rax ; CHECK-NEXT: LBB0_18: ## %while.cond201.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: movl $1, %r14d +; CHECK-NEXT: movl $1, %ebx ; CHECK-NEXT: jmp LBB0_20 ; CHECK-NEXT: LBB0_44: ## %sw.bb1134 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 @@ -184,15 +183,15 @@ ; CHECK-NEXT: jb LBB0_54 ; CHECK-NEXT: ## %bb.45: ## in Loop: Header=BB0_13 Depth=1 ; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: movl $268, %r14d ## imm = 0x10C +; CHECK-NEXT: movl $268, %ebx ## imm = 0x10C ; CHECK-NEXT: jmp LBB0_20 ; CHECK-NEXT: LBB0_39: ## %sw.bb566 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: movl $20, %r14d +; CHECK-NEXT: movl $20, %ebx ; CHECK-NEXT: jmp LBB0_20 ; CHECK-NEXT: LBB0_19: ## %sw.bb243 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: movl $2, %r14d +; CHECK-NEXT: movl $2, %ebx ; CHECK-NEXT: jmp LBB0_20 ; CHECK-NEXT: LBB0_32: ## %if.end517.loopexitsplit ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 @@ -246,30 +245,30 @@ ; CHECK-NEXT: LBB0_11: ; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-NEXT: LBB0_21: ## %while.end1465 -; CHECK-NEXT: incl %r14d -; CHECK-NEXT: cmpl $16, %r14d +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: cmpl $16, %ebx ; CHECK-NEXT: ja LBB0_49 ; CHECK-NEXT: ## %bb.22: ## %while.end1465 ; CHECK-NEXT: movl $83969, %eax ## imm = 0x14801 -; CHECK-NEXT: btl %r14d, %eax +; CHECK-NEXT: btl %ebx, %eax ; CHECK-NEXT: jae LBB0_49 ; CHECK-NEXT: ## %bb.23: -; CHECK-NEXT: xorl %ebp, %ebp -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload ; CHECK-NEXT: LBB0_47: ## %if.then1477 ; CHECK-NEXT: movl $1, %edx ; CHECK-NEXT: callq _write -; CHECK-NEXT: subq %rbp, %rbx +; CHECK-NEXT: subq %rbx, %r14 ; CHECK-NEXT: movq _syHistory@GOTPCREL(%rip), %rax -; CHECK-NEXT: leaq 8189(%rbx,%rax), %rax +; CHECK-NEXT: leaq 8189(%r14,%rax), %rax ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_48: ## %for.body1723 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: decq %rax ; CHECK-NEXT: jmp LBB0_48 ; CHECK-NEXT: LBB0_46: ## %if.then1477.loopexit -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload -; CHECK-NEXT: movq %rbx, %rbp +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload +; CHECK-NEXT: movq %r14, %rbx ; CHECK-NEXT: jmp LBB0_47 ; CHECK-NEXT: LBB0_16: ## %while.cond635.preheader ; CHECK-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/reverse_branches.ll b/llvm/test/CodeGen/X86/reverse_branches.ll --- a/llvm/test/CodeGen/X86/reverse_branches.ll +++ b/llvm/test/CodeGen/X86/reverse_branches.ll @@ -33,24 +33,24 @@ ; CHECK-NEXT: movq ___stack_chk_guard@GOTPCREL(%rip), %rax ; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: xorl %r12d, %r12d +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %r14 ; CHECK-NEXT: movq %rsp, %r15 ; CHECK-NEXT: jmp LBB0_1 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_6: ## %for.inc9 ; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: incl %r12d +; CHECK-NEXT: incl %ebx ; CHECK-NEXT: LBB0_1: ## %for.cond ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB0_3 Depth 2 -; CHECK-NEXT: cmpl $999, %r12d ## imm = 0x3E7 +; CHECK-NEXT: cmpl $999, %ebx ## imm = 0x3E7 ; CHECK-NEXT: jg LBB0_7 ; CHECK-NEXT: 
## %bb.2: ## %for.cond1.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movl $-1, %ebp ; CHECK-NEXT: movq %r15, %rdi -; CHECK-NEXT: movq %r14, %rbx +; CHECK-NEXT: movq %r14, %r12 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_3: ## %for.cond1 ; CHECK-NEXT: ## Parent Loop BB0_1 Depth=1 @@ -60,12 +60,12 @@ ; CHECK-NEXT: jg LBB0_6 ; CHECK-NEXT: ## %bb.4: ## %for.body3 ; CHECK-NEXT: ## in Loop: Header=BB0_3 Depth=2 -; CHECK-NEXT: addq $1002, %rbx ## imm = 0x3EA +; CHECK-NEXT: addq $1002, %r12 ## imm = 0x3EA ; CHECK-NEXT: leaq 1001(%rdi), %r13 ; CHECK-NEXT: movl $1000, %edx ## imm = 0x3E8 ; CHECK-NEXT: movl $120, %esi ; CHECK-NEXT: callq _memchr -; CHECK-NEXT: cmpq %rax, %rbx +; CHECK-NEXT: cmpq %rax, %r12 ; CHECK-NEXT: movq %r13, %rdi ; CHECK-NEXT: je LBB0_3 ; CHECK-NEXT: jmp LBB0_5 @@ -94,11 +94,11 @@ ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_14: ## %exit ; CHECK-NEXT: ## in Loop: Header=BB0_10 Depth=2 -; CHECK-NEXT: addq %rsi, %rbp +; CHECK-NEXT: addq %rsi, %r8 ; CHECK-NEXT: incq %rdi ; CHECK-NEXT: decq %rsi ; CHECK-NEXT: addq $1001, %rdx ## imm = 0x3E9 -; CHECK-NEXT: cmpq $-1000, %rbp ## imm = 0xFC18 +; CHECK-NEXT: cmpq $-1000, %r8 ## imm = 0xFC18 ; CHECK-NEXT: jne LBB0_5 ; CHECK-NEXT: LBB0_10: ## %for.cond18 ; CHECK-NEXT: ## Parent Loop BB0_8 Depth=1 @@ -108,17 +108,17 @@ ; CHECK-NEXT: jg LBB0_15 ; CHECK-NEXT: ## %bb.11: ## %for.body20 ; CHECK-NEXT: ## in Loop: Header=BB0_10 Depth=2 -; CHECK-NEXT: movq $-1000, %rbp ## imm = 0xFC18 +; CHECK-NEXT: movq $-1000, %r8 ## imm = 0xFC18 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_12: ## %do.body.i ; CHECK-NEXT: ## Parent Loop BB0_8 Depth=1 ; CHECK-NEXT: ## Parent Loop BB0_10 Depth=2 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=3 -; CHECK-NEXT: cmpb $120, 1000(%rdx,%rbp) +; CHECK-NEXT: cmpb $120, 1000(%rdx,%r8) ; CHECK-NEXT: je LBB0_14 ; CHECK-NEXT: ## %bb.13: ## %do.cond.i ; CHECK-NEXT: ## in Loop: Header=BB0_12 Depth=3 -; CHECK-NEXT: incq %rbp +; CHECK-NEXT: incq %r8 ; CHECK-NEXT: jne LBB0_12 ; CHECK-NEXT: LBB0_5: ## %if.then ; CHECK-NEXT: leaq L_str4(%rip), %rdi diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -355,7 +355,7 @@ ; ; AVX1-LABEL: sad_avx64i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -382,15 +382,15 @@ ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %middle.block ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm7 -; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm7 +; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm8 +; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm8 +; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm1 -; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ 
b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -1007,46 +1007,46 @@ ; ; SSE41-LABEL: v16i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm3, %xmm8 -; SSE41-NEXT: movdqa %xmm2, %xmm10 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: paddd %xmm4, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm11 +; SSE41-NEXT: movdqa %xmm2, %xmm8 +; SSE41-NEXT: movdqa %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm0, %xmm10 +; SSE41-NEXT: paddd %xmm4, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm10, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm10 +; SSE41-NEXT: movdqa %xmm9, %xmm1 ; SSE41-NEXT: paddd %xmm5, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm9 +; SSE41-NEXT: pxor %xmm5, %xmm9 ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm10, %xmm2 +; SSE41-NEXT: movdqa %xmm8, %xmm2 ; SSE41-NEXT: paddd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE41-NEXT: pxor %xmm6, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE41-NEXT: pxor %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: movdqa %xmm11, %xmm3 ; SSE41-NEXT: paddd %xmm7, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE41-NEXT: pxor %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE41-NEXT: pxor %xmm7, %xmm11 ; SSE41-NEXT: movdqa %xmm3, %xmm5 ; SSE41-NEXT: psrad $31, %xmm5 ; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3 -; SSE41-NEXT: movaps %xmm9, %xmm0 +; SSE41-NEXT: movaps %xmm10, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i32: @@ -1267,8 +1267,8 @@ ; SSE2-NEXT: pandn %xmm0, %xmm6 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: pxor %xmm5, %xmm0 ; SSE2-NEXT: pand %xmm7, %xmm0 ; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm6 @@ -1277,20 +1277,20 @@ ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: movdqa %xmm6, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm3 ; 
SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq @@ -1320,8 +1320,8 @@ ; SSSE3-NEXT: pandn %xmm0, %xmm6 ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: pxor %xmm5, %xmm0 ; SSSE3-NEXT: pand %xmm7, %xmm0 ; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm6 @@ -1330,20 +1330,20 @@ ; SSSE3-NEXT: pxor %xmm1, %xmm4 ; SSSE3-NEXT: movdqa %xmm6, %xmm7 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm1, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm1 ; SSSE3-NEXT: pand %xmm2, %xmm1 ; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq @@ -1363,11 +1363,11 @@ ; SSE41-NEXT: pand %xmm7, %xmm5 ; SSE41-NEXT: por %xmm0, %xmm5 ; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm7, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm8, %xmm2 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -1382,9 +1382,9 @@ ; SSE41-NEXT: por %xmm0, %xmm2 ; SSE41-NEXT: pxor %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; @@ -1463,88 +1463,88 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm9, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pxor %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm10 ; SSE2-NEXT: pandn %xmm0, %xmm10 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] ; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm11, 
%xmm0 ; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm10 ; SSE2-NEXT: paddq %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: pxor %xmm8, %xmm11 ; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 ; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm5 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: paddq %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: paddq %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm5, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] ; SSE2-NEXT: pand %xmm12, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm5 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pxor %xmm9, %xmm2 ; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 ; SSE2-NEXT: paddq %xmm7, %xmm3 ; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2-NEXT: pxor 
%xmm6, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpgtd %xmm4, %xmm11 -; SSE2-NEXT: pxor %xmm5, %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm4 -; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm5 ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: pxor %xmm9, %xmm3 -; SSE2-NEXT: pand %xmm11, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i64: @@ -1564,88 +1564,88 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] ; SSSE3-NEXT: por %xmm9, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSSE3-NEXT: pxor %xmm11, %xmm11 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm10 +; SSSE3-NEXT: pxor %xmm11, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11 +; SSSE3-NEXT: pxor %xmm10, %xmm11 +; SSSE3-NEXT: movdqa %xmm11, %xmm10 ; SSSE3-NEXT: pandn %xmm0, %xmm10 ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] ; SSSE3-NEXT: pxor %xmm9, %xmm0 -; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pand %xmm11, %xmm0 ; SSSE3-NEXT: por %xmm10, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm10 ; SSSE3-NEXT: pxor %xmm8, %xmm10 ; SSSE3-NEXT: paddq %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm1, %xmm11 +; SSSE3-NEXT: pxor %xmm8, %xmm11 ; SSSE3-NEXT: movdqa %xmm10, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 ; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pandn %xmm1, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pxor %xmm10, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 +; SSSE3-NEXT: pxor %xmm11, %xmm10 +; SSSE3-NEXT: movdqa %xmm10, %xmm5 +; SSSE3-NEXT: pandn %xmm1, %xmm5 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm10 -; SSSE3-NEXT: paddq %xmm6, %xmm2 +; SSSE3-NEXT: pand %xmm10, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 ; SSSE3-NEXT: pxor %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm10, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; 
SSSE3-NEXT: paddq %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm10 +; SSSE3-NEXT: pxor %xmm8, %xmm10 +; SSSE3-NEXT: movdqa %xmm5, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] ; SSSE3-NEXT: pand %xmm12, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm4 -; SSSE3-NEXT: pandn %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm5 +; SSSE3-NEXT: pandn %xmm2, %xmm5 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: pxor %xmm9, %xmm2 ; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: por %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm8, %xmm5 ; SSSE3-NEXT: paddq %xmm7, %xmm3 ; SSSE3-NEXT: pxor %xmm3, %xmm8 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSSE3-NEXT: pxor %xmm6, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11 -; SSSE3-NEXT: pxor %xmm5, %xmm11 -; SSSE3-NEXT: movdqa %xmm11, %xmm4 -; SSSE3-NEXT: pandn %xmm3, %xmm4 +; SSSE3-NEXT: pandn %xmm3, %xmm5 ; SSSE3-NEXT: psrad $31, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: pxor %xmm9, %xmm3 -; SSSE3-NEXT: pand %xmm11, %xmm3 -; SSSE3-NEXT: por %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: por %xmm5, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i64: @@ -1792,66 +1792,62 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE-LABEL: v2i128: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbx ; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %r8 -; SSE-NEXT: seto %r10b -; SSE-NEXT: movq %r8, %rbx -; SSE-NEXT: sarq $63, %rbx -; SSE-NEXT: testb %r10b, %r10b -; SSE-NEXT: cmovneq %rbx, %rcx +; SSE-NEXT: seto %dil +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: sarq $63, %r10 +; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: cmovneq %r10, %rcx ; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 -; SSE-NEXT: xorq %r11, %rbx -; SSE-NEXT: testb %r10b, %r10b -; SSE-NEXT: cmoveq %r8, %rbx +; SSE-NEXT: xorq %r11, %r10 +; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: cmoveq %r8, %r10 ; SSE-NEXT: addq %r9, %rsi ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %rdx -; SSE-NEXT: seto %r8b -; SSE-NEXT: movq %rdx, %rdi -; SSE-NEXT: sarq $63, %rdi -; SSE-NEXT: testb %r8b, %r8b -; SSE-NEXT: cmovneq %rdi, 
%rsi -; SSE-NEXT: xorq %r11, %rdi -; SSE-NEXT: testb %r8b, %r8b -; SSE-NEXT: cmoveq %rdx, %rdi +; SSE-NEXT: seto %dil +; SSE-NEXT: movq %rdx, %r8 +; SSE-NEXT: sarq $63, %r8 +; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: cmovneq %r8, %rsi +; SSE-NEXT: xorq %r11, %r8 +; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: cmoveq %rdx, %r8 ; SSE-NEXT: movq %rcx, 16(%rax) ; SSE-NEXT: movq %rsi, (%rax) -; SSE-NEXT: movq %rbx, 24(%rax) -; SSE-NEXT: movq %rdi, 8(%rax) -; SSE-NEXT: popq %rbx +; SSE-NEXT: movq %r10, 24(%rax) +; SSE-NEXT: movq %r8, 8(%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: v2i128: ; AVX: # %bb.0: -; AVX-NEXT: pushq %rbx ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: seto %r10b -; AVX-NEXT: movq %r8, %rbx -; AVX-NEXT: sarq $63, %rbx -; AVX-NEXT: testb %r10b, %r10b -; AVX-NEXT: cmovneq %rbx, %rcx +; AVX-NEXT: seto %dil +; AVX-NEXT: movq %r8, %r10 +; AVX-NEXT: sarq $63, %r10 +; AVX-NEXT: testb %dil, %dil +; AVX-NEXT: cmovneq %r10, %rcx ; AVX-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 -; AVX-NEXT: xorq %r11, %rbx -; AVX-NEXT: testb %r10b, %r10b -; AVX-NEXT: cmoveq %r8, %rbx +; AVX-NEXT: xorq %r11, %r10 +; AVX-NEXT: testb %dil, %dil +; AVX-NEXT: cmoveq %r8, %r10 ; AVX-NEXT: addq %r9, %rsi ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rdx -; AVX-NEXT: seto %r8b -; AVX-NEXT: movq %rdx, %rdi -; AVX-NEXT: sarq $63, %rdi -; AVX-NEXT: testb %r8b, %r8b -; AVX-NEXT: cmovneq %rdi, %rsi -; AVX-NEXT: xorq %r11, %rdi -; AVX-NEXT: testb %r8b, %r8b -; AVX-NEXT: cmoveq %rdx, %rdi +; AVX-NEXT: seto %dil +; AVX-NEXT: movq %rdx, %r8 +; AVX-NEXT: sarq $63, %r8 +; AVX-NEXT: testb %dil, %dil +; AVX-NEXT: cmovneq %r8, %rsi +; AVX-NEXT: xorq %r11, %r8 +; AVX-NEXT: testb %dil, %dil +; AVX-NEXT: cmoveq %rdx, %r8 ; AVX-NEXT: movq %rcx, 16(%rax) ; AVX-NEXT: movq %rsi, (%rax) -; AVX-NEXT: movq %rbx, 24(%rax) -; AVX-NEXT: movq %rdi, 8(%rax) -; AVX-NEXT: popq %rbx +; AVX-NEXT: movq %r10, 24(%rax) +; AVX-NEXT: movq %r8, 8(%rax) ; AVX-NEXT: retq %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y) ret <2 x i128> %z diff --git a/llvm/test/CodeGen/X86/sbb-false-dep.ll b/llvm/test/CodeGen/X86/sbb-false-dep.ll --- a/llvm/test/CodeGen/X86/sbb-false-dep.ll +++ b/llvm/test/CodeGen/X86/sbb-false-dep.ll @@ -10,11 +10,9 @@ ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movl %r8d, %r13d +; CHECK-NEXT: movl %r8d, %ebp ; CHECK-NEXT: movl %ecx, %r14d ; CHECK-NEXT: movl %edx, %r15d ; CHECK-NEXT: movq %rsi, %rbx @@ -24,16 +22,16 @@ ; CHECK-NEXT: movq %rbx, %rdx ; CHECK-NEXT: callq foo1@PLT ; CHECK-NEXT: movq 8(%rbx), %rax -; CHECK-NEXT: movq (%rax), %rdx -; CHECK-NEXT: xorl %ebp, %ebp -; CHECK-NEXT: movl %r13d, %ecx +; CHECK-NEXT: movq (%rax), %rax +; CHECK-NEXT: xorl %r10d, %r10d +; CHECK-NEXT: movl %ebp, %ecx ; CHECK-NEXT: negl %ecx -; CHECK-NEXT: movl $0, %eax -; CHECK-NEXT: sbbq %rax, %rax -; CHECK-NEXT: orq %rdx, %rax -; CHECK-NEXT: cmpl $1, %r13d -; CHECK-NEXT: sbbq %rbp, %rbp -; CHECK-NEXT: orq %rdx, %rbp +; CHECK-NEXT: movl $0, %r11d +; CHECK-NEXT: sbbq %r11, %r11 +; CHECK-NEXT: orq %rax, %r11 +; CHECK-NEXT: cmpl $1, %ebp +; CHECK-NEXT: sbbq %r10, %r10 +; CHECK-NEXT: orq %rax, %r10 ; CHECK-NEXT: subq $8, %rsp ; CHECK-NEXT: movq %r12, %rdi ; CHECK-NEXT: movl %r15d, %esi @@ -41,14 +39,13 @@ ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: xorl %r8d, %r8d ; CHECK-NEXT: xorl %r9d, %r9d -; CHECK-NEXT: 
pushq %rbp -; CHECK-NEXT: pushq %rax +; CHECK-NEXT: pushq %r10 +; CHECK-NEXT: pushq %r11 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: callq foo2@PLT -; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: addq $32, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp @@ -59,11 +56,9 @@ ; IDIOM-NEXT: pushq %rbp ; IDIOM-NEXT: pushq %r15 ; IDIOM-NEXT: pushq %r14 -; IDIOM-NEXT: pushq %r13 ; IDIOM-NEXT: pushq %r12 ; IDIOM-NEXT: pushq %rbx -; IDIOM-NEXT: pushq %rax -; IDIOM-NEXT: movl %r8d, %r13d +; IDIOM-NEXT: movl %r8d, %ebp ; IDIOM-NEXT: movl %ecx, %r14d ; IDIOM-NEXT: movl %edx, %r15d ; IDIOM-NEXT: movq %rsi, %rbx @@ -73,14 +68,14 @@ ; IDIOM-NEXT: movq %rbx, %rdx ; IDIOM-NEXT: callq foo1@PLT ; IDIOM-NEXT: movq 8(%rbx), %rax -; IDIOM-NEXT: movq (%rax), %rdx -; IDIOM-NEXT: movl %r13d, %ecx +; IDIOM-NEXT: movq (%rax), %rax +; IDIOM-NEXT: movl %ebp, %ecx ; IDIOM-NEXT: negl %ecx -; IDIOM-NEXT: sbbq %rbp, %rbp -; IDIOM-NEXT: orq %rdx, %rbp -; IDIOM-NEXT: cmpl $1, %r13d -; IDIOM-NEXT: sbbq %rax, %rax -; IDIOM-NEXT: orq %rdx, %rax +; IDIOM-NEXT: sbbq %r10, %r10 +; IDIOM-NEXT: orq %rax, %r10 +; IDIOM-NEXT: cmpl $1, %ebp +; IDIOM-NEXT: sbbq %r11, %r11 +; IDIOM-NEXT: orq %rax, %r11 ; IDIOM-NEXT: subq $8, %rsp ; IDIOM-NEXT: movq %r12, %rdi ; IDIOM-NEXT: movl %r15d, %esi @@ -88,14 +83,13 @@ ; IDIOM-NEXT: xorl %ecx, %ecx ; IDIOM-NEXT: xorl %r8d, %r8d ; IDIOM-NEXT: xorl %r9d, %r9d -; IDIOM-NEXT: pushq %rax -; IDIOM-NEXT: pushq %rbp +; IDIOM-NEXT: pushq %r11 +; IDIOM-NEXT: pushq %r10 ; IDIOM-NEXT: pushq %rbx ; IDIOM-NEXT: callq foo2@PLT -; IDIOM-NEXT: addq $40, %rsp +; IDIOM-NEXT: addq $32, %rsp ; IDIOM-NEXT: popq %rbx ; IDIOM-NEXT: popq %r12 -; IDIOM-NEXT: popq %r13 ; IDIOM-NEXT: popq %r14 ; IDIOM-NEXT: popq %r15 ; IDIOM-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/scalar_widen_div.ll b/llvm/test/CodeGen/X86/scalar_widen_div.ll --- a/llvm/test/CodeGen/X86/scalar_widen_div.ll +++ b/llvm/test/CodeGen/X86/scalar_widen_div.ll @@ -7,27 +7,27 @@ define void @vectorDiv (ptr addrspace(1) %nsource, ptr addrspace(1) %dsource, ptr addrspace(1) %qdest) nounwind { ; CHECK-LABEL: vectorDiv: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rdx, %r8 +; CHECK-NEXT: movq %rdx, %rcx ; CHECK-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movq (%rdi,%rcx,8), %rdi -; CHECK-NEXT: movq (%rsi,%rcx,8), %r10 +; CHECK-NEXT: movslq -{{[0-9]+}}(%rsp), %r8 +; CHECK-NEXT: movq (%rdi,%r8,8), %rdi +; CHECK-NEXT: movq (%rsi,%r8,8), %r9 ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: movq %r10, %rsi +; CHECK-NEXT: movq %r9, %rsi ; CHECK-NEXT: shrq $32, %rsi ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %esi -; CHECK-NEXT: movl %eax, %r9d +; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl %r10d +; CHECK-NEXT: idivl %r9d ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: pinsrd $1, %r9d, %xmm0 -; CHECK-NEXT: movq %xmm0, (%r8,%rcx,8) +; CHECK-NEXT: pinsrd $1, %esi, %xmm0 +; CHECK-NEXT: movq %xmm0, (%rcx,%r8,8) ; CHECK-NEXT: retq entry: %nsource.addr = alloca ptr addrspace(1), align 4 @@ -58,15 +58,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movsbl %dil, %eax ; CHECK-NEXT: idivb %cl -; CHECK-NEXT: movl %eax, %edi +; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: movsbl %sil, %eax ; CHECK-NEXT: idivb %r8b ; CHECK-NEXT: movl 
%eax, %esi ; CHECK-NEXT: movsbl %dl, %eax ; CHECK-NEXT: idivb %r9b -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movl %eax, %edi +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movl %edi, %ecx ; CHECK-NEXT: retq %div.r = sdiv <3 x i8> %num, %div ret <3 x i8> %div.r @@ -77,15 +78,16 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: divb %cl -; CHECK-NEXT: movl %eax, %edi +; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: movzbl %sil, %eax ; CHECK-NEXT: divb %r8b ; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: movzbl %dl, %eax ; CHECK-NEXT: divb %r9b -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movl %eax, %edi +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movl %edi, %ecx ; CHECK-NEXT: retq %div.r = udiv <3 x i8> %num, %div ret <3 x i8> %div.r @@ -99,36 +101,36 @@ ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd ; CHECK-NEXT: idivw %cx -; CHECK-NEXT: movl %eax, %r8d +; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: pextrw $3, %xmm0, %eax -; CHECK-NEXT: pextrw $3, %xmm1, %ecx +; CHECK-NEXT: pextrw $3, %xmm1, %esi ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd -; CHECK-NEXT: idivw %cx -; CHECK-NEXT: movl %eax, %r9d +; CHECK-NEXT: idivw %si +; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: pextrw $2, %xmm0, %eax -; CHECK-NEXT: pextrw $2, %xmm1, %ecx +; CHECK-NEXT: pextrw $2, %xmm1, %edi ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd -; CHECK-NEXT: idivw %cx +; CHECK-NEXT: idivw %di ; CHECK-NEXT: movl %eax, %edi ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: movd %xmm1, %ecx +; CHECK-NEXT: movd %xmm1, %r8d ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd -; CHECK-NEXT: idivw %cx -; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: idivw %r8w +; CHECK-NEXT: movl %eax, %r8d ; CHECK-NEXT: pextrw $1, %xmm0, %eax -; CHECK-NEXT: pextrw $1, %xmm1, %esi +; CHECK-NEXT: pextrw $1, %xmm1, %r9d ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd -; CHECK-NEXT: idivw %si +; CHECK-NEXT: idivw %r9w ; CHECK-NEXT: # kill: def $ax killed $ax def $eax -; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: movd %r8d, %xmm0 ; CHECK-NEXT: pinsrw $1, %eax, %xmm0 ; CHECK-NEXT: pinsrw $2, %edi, %xmm0 -; CHECK-NEXT: pinsrw $3, %r9d, %xmm0 -; CHECK-NEXT: pinsrw $4, %r8d, %xmm0 +; CHECK-NEXT: pinsrw $3, %esi, %xmm0 +; CHECK-NEXT: pinsrw $4, %ecx, %xmm0 ; CHECK-NEXT: retq %div.r = sdiv <5 x i16> %num, %div ret <5 x i16> %div.r @@ -285,36 +287,36 @@ ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd ; CHECK-NEXT: idivw %cx -; CHECK-NEXT: movl %edx, %r8d +; CHECK-NEXT: movl %edx, %ecx ; CHECK-NEXT: pextrw $3, %xmm0, %eax -; CHECK-NEXT: pextrw $3, %xmm1, %ecx +; CHECK-NEXT: pextrw $3, %xmm1, %esi ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd -; CHECK-NEXT: idivw %cx -; CHECK-NEXT: movl %edx, %r9d +; CHECK-NEXT: idivw %si +; CHECK-NEXT: movl %edx, %esi ; CHECK-NEXT: pextrw $2, %xmm0, %eax -; CHECK-NEXT: pextrw $2, %xmm1, %ecx +; CHECK-NEXT: pextrw $2, %xmm1, %edi ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd -; CHECK-NEXT: idivw %cx +; CHECK-NEXT: idivw %di ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: movd %xmm1, %ecx +; CHECK-NEXT: movd %xmm1, %r8d ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd -; CHECK-NEXT: idivw %cx -; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: idivw %r8w +; 
CHECK-NEXT: movl %edx, %r8d ; CHECK-NEXT: pextrw $1, %xmm0, %eax -; CHECK-NEXT: pextrw $1, %xmm1, %esi +; CHECK-NEXT: pextrw $1, %xmm1, %r9d ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd -; CHECK-NEXT: idivw %si +; CHECK-NEXT: idivw %r9w ; CHECK-NEXT: # kill: def $dx killed $dx def $edx -; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: movd %r8d, %xmm0 ; CHECK-NEXT: pinsrw $1, %edx, %xmm0 ; CHECK-NEXT: pinsrw $2, %edi, %xmm0 -; CHECK-NEXT: pinsrw $3, %r9d, %xmm0 -; CHECK-NEXT: pinsrw $4, %r8d, %xmm0 +; CHECK-NEXT: pinsrw $3, %esi, %xmm0 +; CHECK-NEXT: pinsrw $4, %ecx, %xmm0 ; CHECK-NEXT: retq %rem.r = srem <5 x i16> %num, %rem ret <5 x i16> %rem.r @@ -390,28 +392,28 @@ ; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle .LBB12_3 ; CHECK-NEXT: # %bb.1: # %bb.nph -; CHECK-NEXT: movl %edx, %r10d -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: xorl %r10d, %r10d ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB12_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl (%rdi,%rcx), %r8d -; CHECK-NEXT: movl 4(%rdi,%rcx), %eax +; CHECK-NEXT: movl (%rdi,%r10), %r8d +; CHECK-NEXT: movl 4(%rdi,%r10), %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl 4(%rsi,%rcx) +; CHECK-NEXT: idivl 4(%rsi,%r10) ; CHECK-NEXT: movl %eax, %r9d ; CHECK-NEXT: movl %r8d, %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl (%rsi,%rcx) +; CHECK-NEXT: idivl (%rsi,%r10) ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: pinsrd $1, %r9d, %xmm0 -; CHECK-NEXT: movl 8(%rdi,%rcx), %eax +; CHECK-NEXT: movl 8(%rdi,%r10), %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl 8(%rsi,%rcx) -; CHECK-NEXT: movl %eax, 8(%rdi,%rcx) -; CHECK-NEXT: movq %xmm0, (%rdi,%rcx) -; CHECK-NEXT: addq $16, %rcx -; CHECK-NEXT: decl %r10d +; CHECK-NEXT: idivl 8(%rsi,%r10) +; CHECK-NEXT: movl %eax, 8(%rdi,%r10) +; CHECK-NEXT: movq %xmm0, (%rdi,%r10) +; CHECK-NEXT: addq $16, %r10 +; CHECK-NEXT: decl %ecx ; CHECK-NEXT: jne .LBB12_2 ; CHECK-NEXT: .LBB12_3: # %for.end ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll --- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll +++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll @@ -17,41 +17,41 @@ ; ILP-NEXT: movq %rdi, %rax ; ILP-NEXT: xorl %r8d, %r8d ; ILP-NEXT: addl %esi, %esi -; ILP-NEXT: leal 3(%rsi), %r9d +; ILP-NEXT: leal 3(%rsi), %edx +; ILP-NEXT: movl $1, %r9d +; ILP-NEXT: xorl %r10d, %r10d +; ILP-NEXT: movl %edx, %ecx +; ILP-NEXT: shldq %cl, %r9, %r10 ; ILP-NEXT: movl $1, %r11d -; ILP-NEXT: xorl %r14d, %r14d -; ILP-NEXT: movl %r9d, %ecx -; ILP-NEXT: shldq %cl, %r11, %r14 -; ILP-NEXT: movl $1, %edx -; ILP-NEXT: shlq %cl, %rdx -; ILP-NEXT: leal -125(%rsi), %r10d +; ILP-NEXT: shlq %cl, %r11 +; ILP-NEXT: leal -125(%rsi), %edi ; ILP-NEXT: xorl %ebx, %ebx -; ILP-NEXT: movl %r10d, %ecx -; ILP-NEXT: shldq %cl, %r11, %rbx -; ILP-NEXT: testb $64, %r9b -; ILP-NEXT: cmovneq %rdx, %r14 -; ILP-NEXT: cmovneq %r8, %rdx -; ILP-NEXT: movl $1, %edi -; ILP-NEXT: shlq %cl, %rdi +; ILP-NEXT: movl %edi, %ecx +; ILP-NEXT: shldq %cl, %r9, %rbx +; ILP-NEXT: testb $64, %dl +; ILP-NEXT: cmovneq %r11, %r10 +; ILP-NEXT: cmovneq %r8, %r11 +; ILP-NEXT: movl $1, %r14d +; ILP-NEXT: shlq %cl, %r14 ; ILP-NEXT: movb $125, %cl ; ILP-NEXT: subb %sil, %cl -; ILP-NEXT: shrdq %cl, %r8, %r11 +; ILP-NEXT: shrdq %cl, %r8, %r9 ; ILP-NEXT: testb $64, %cl -; ILP-NEXT: cmovneq %r8, %r11 -; ILP-NEXT: testb $64, %r10b -; ILP-NEXT: cmovneq %rdi, %rbx -; ILP-NEXT: cmovneq %r8, %rdi -; ILP-NEXT: testb %r9b, %r9b -; ILP-NEXT: 
cmovsq %r8, %r14 -; ILP-NEXT: cmovsq %r8, %rdx -; ILP-NEXT: movq %r14, 8(%rax) -; ILP-NEXT: movq %rdx, (%rax) +; ILP-NEXT: cmovneq %r8, %r9 +; ILP-NEXT: testb $64, %dil +; ILP-NEXT: cmovneq %r14, %rbx +; ILP-NEXT: cmovneq %r8, %r14 +; ILP-NEXT: testb %dl, %dl +; ILP-NEXT: cmovsq %r8, %r10 +; ILP-NEXT: cmovsq %r8, %r11 +; ILP-NEXT: movq %r10, 8(%rax) +; ILP-NEXT: movq %r11, (%rax) ; ILP-NEXT: cmovnsq %r8, %rbx ; ILP-NEXT: cmoveq %r8, %rbx ; ILP-NEXT: movq %rbx, 24(%rax) -; ILP-NEXT: cmovnsq %r11, %rdi -; ILP-NEXT: cmoveq %r8, %rdi -; ILP-NEXT: movq %rdi, 16(%rax) +; ILP-NEXT: cmovnsq %r9, %r14 +; ILP-NEXT: cmoveq %r8, %r14 +; ILP-NEXT: movq %r14, 16(%rax) ; ILP-NEXT: popq %rbx ; ILP-NEXT: popq %r14 ; ILP-NEXT: retq @@ -63,41 +63,41 @@ ; HYBRID-NEXT: addl %esi, %esi ; HYBRID-NEXT: movb $125, %cl ; HYBRID-NEXT: subb %sil, %cl -; HYBRID-NEXT: xorl %r8d, %r8d -; HYBRID-NEXT: movl $1, %edi +; HYBRID-NEXT: xorl %edi, %edi ; HYBRID-NEXT: movl $1, %r9d -; HYBRID-NEXT: shrdq %cl, %r8, %r9 +; HYBRID-NEXT: movl $1, %r8d +; HYBRID-NEXT: shrdq %cl, %rdi, %r8 ; HYBRID-NEXT: testb $64, %cl -; HYBRID-NEXT: cmovneq %r8, %r9 -; HYBRID-NEXT: leal 3(%rsi), %r10d +; HYBRID-NEXT: cmovneq %rdi, %r8 +; HYBRID-NEXT: leal 3(%rsi), %edx ; HYBRID-NEXT: xorl %r11d, %r11d -; HYBRID-NEXT: movl %r10d, %ecx -; HYBRID-NEXT: shldq %cl, %rdi, %r11 +; HYBRID-NEXT: movl %edx, %ecx +; HYBRID-NEXT: shldq %cl, %r9, %r11 ; HYBRID-NEXT: addb $-125, %sil ; HYBRID-NEXT: xorl %ebx, %ebx ; HYBRID-NEXT: movl %esi, %ecx -; HYBRID-NEXT: shldq %cl, %rdi, %rbx -; HYBRID-NEXT: movl $1, %edx -; HYBRID-NEXT: shlq %cl, %rdx +; HYBRID-NEXT: shldq %cl, %r9, %rbx +; HYBRID-NEXT: movl $1, %r10d +; HYBRID-NEXT: shlq %cl, %r10 ; HYBRID-NEXT: testb $64, %sil -; HYBRID-NEXT: cmovneq %rdx, %rbx -; HYBRID-NEXT: cmovneq %r8, %rdx -; HYBRID-NEXT: movl %r10d, %ecx -; HYBRID-NEXT: shlq %cl, %rdi -; HYBRID-NEXT: testb $64, %r10b -; HYBRID-NEXT: cmovneq %rdi, %r11 -; HYBRID-NEXT: cmovneq %r8, %rdi -; HYBRID-NEXT: testb %r10b, %r10b -; HYBRID-NEXT: cmovsq %r8, %r11 +; HYBRID-NEXT: cmovneq %r10, %rbx +; HYBRID-NEXT: cmovneq %rdi, %r10 +; HYBRID-NEXT: movl %edx, %ecx +; HYBRID-NEXT: shlq %cl, %r9 +; HYBRID-NEXT: testb $64, %dl +; HYBRID-NEXT: cmovneq %r9, %r11 +; HYBRID-NEXT: cmovneq %rdi, %r9 +; HYBRID-NEXT: testb %dl, %dl +; HYBRID-NEXT: cmovsq %rdi, %r11 ; HYBRID-NEXT: movq %r11, 8(%rax) -; HYBRID-NEXT: cmovsq %r8, %rdi -; HYBRID-NEXT: movq %rdi, (%rax) -; HYBRID-NEXT: cmovnsq %r8, %rbx -; HYBRID-NEXT: cmoveq %r8, %rbx +; HYBRID-NEXT: cmovsq %rdi, %r9 +; HYBRID-NEXT: movq %r9, (%rax) +; HYBRID-NEXT: cmovnsq %rdi, %rbx +; HYBRID-NEXT: cmoveq %rdi, %rbx ; HYBRID-NEXT: movq %rbx, 24(%rax) -; HYBRID-NEXT: cmovnsq %r9, %rdx -; HYBRID-NEXT: cmoveq %r8, %rdx -; HYBRID-NEXT: movq %rdx, 16(%rax) +; HYBRID-NEXT: cmovnsq %r8, %r10 +; HYBRID-NEXT: cmoveq %rdi, %r10 +; HYBRID-NEXT: movq %r10, 16(%rax) ; HYBRID-NEXT: popq %rbx ; HYBRID-NEXT: retq ; @@ -108,41 +108,41 @@ ; BURR-NEXT: addl %esi, %esi ; BURR-NEXT: movb $125, %cl ; BURR-NEXT: subb %sil, %cl -; BURR-NEXT: xorl %r8d, %r8d -; BURR-NEXT: movl $1, %edi +; BURR-NEXT: xorl %edi, %edi ; BURR-NEXT: movl $1, %r9d -; BURR-NEXT: shrdq %cl, %r8, %r9 +; BURR-NEXT: movl $1, %r8d +; BURR-NEXT: shrdq %cl, %rdi, %r8 ; BURR-NEXT: testb $64, %cl -; BURR-NEXT: cmovneq %r8, %r9 -; BURR-NEXT: leal 3(%rsi), %r10d +; BURR-NEXT: cmovneq %rdi, %r8 +; BURR-NEXT: leal 3(%rsi), %edx ; BURR-NEXT: xorl %r11d, %r11d -; BURR-NEXT: movl %r10d, %ecx -; BURR-NEXT: shldq %cl, %rdi, %r11 +; BURR-NEXT: movl %edx, %ecx +; BURR-NEXT: shldq %cl, %r9, 
%r11 ; BURR-NEXT: addb $-125, %sil ; BURR-NEXT: xorl %ebx, %ebx ; BURR-NEXT: movl %esi, %ecx -; BURR-NEXT: shldq %cl, %rdi, %rbx -; BURR-NEXT: movl $1, %edx -; BURR-NEXT: shlq %cl, %rdx +; BURR-NEXT: shldq %cl, %r9, %rbx +; BURR-NEXT: movl $1, %r10d +; BURR-NEXT: shlq %cl, %r10 ; BURR-NEXT: testb $64, %sil -; BURR-NEXT: cmovneq %rdx, %rbx -; BURR-NEXT: cmovneq %r8, %rdx -; BURR-NEXT: movl %r10d, %ecx -; BURR-NEXT: shlq %cl, %rdi -; BURR-NEXT: testb $64, %r10b -; BURR-NEXT: cmovneq %rdi, %r11 -; BURR-NEXT: cmovneq %r8, %rdi -; BURR-NEXT: testb %r10b, %r10b -; BURR-NEXT: cmovsq %r8, %r11 +; BURR-NEXT: cmovneq %r10, %rbx +; BURR-NEXT: cmovneq %rdi, %r10 +; BURR-NEXT: movl %edx, %ecx +; BURR-NEXT: shlq %cl, %r9 +; BURR-NEXT: testb $64, %dl +; BURR-NEXT: cmovneq %r9, %r11 +; BURR-NEXT: cmovneq %rdi, %r9 +; BURR-NEXT: testb %dl, %dl +; BURR-NEXT: cmovsq %rdi, %r11 ; BURR-NEXT: movq %r11, 8(%rax) -; BURR-NEXT: cmovsq %r8, %rdi -; BURR-NEXT: movq %rdi, (%rax) -; BURR-NEXT: cmovnsq %r8, %rbx -; BURR-NEXT: cmoveq %r8, %rbx +; BURR-NEXT: cmovsq %rdi, %r9 +; BURR-NEXT: movq %r9, (%rax) +; BURR-NEXT: cmovnsq %rdi, %rbx +; BURR-NEXT: cmoveq %rdi, %rbx ; BURR-NEXT: movq %rbx, 24(%rax) -; BURR-NEXT: cmovnsq %r9, %rdx -; BURR-NEXT: cmoveq %r8, %rdx -; BURR-NEXT: movq %rdx, 16(%rax) +; BURR-NEXT: cmovnsq %r8, %r10 +; BURR-NEXT: cmoveq %rdi, %r10 +; BURR-NEXT: movq %r10, 16(%rax) ; BURR-NEXT: popq %rbx ; BURR-NEXT: retq ; @@ -151,7 +151,7 @@ ; SRC-NEXT: pushq %rbx ; SRC-NEXT: movq %rdi, %rax ; SRC-NEXT: addl %esi, %esi -; SRC-NEXT: leal 3(%rsi), %r9d +; SRC-NEXT: leal 3(%rsi), %edx ; SRC-NEXT: movb $125, %cl ; SRC-NEXT: subb %sil, %cl ; SRC-NEXT: xorl %r8d, %r8d @@ -161,32 +161,32 @@ ; SRC-NEXT: testb $64, %cl ; SRC-NEXT: cmovneq %r8, %r10 ; SRC-NEXT: addb $-125, %sil -; SRC-NEXT: xorl %edx, %edx +; SRC-NEXT: xorl %r9d, %r9d ; SRC-NEXT: movl %esi, %ecx -; SRC-NEXT: shldq %cl, %rdi, %rdx +; SRC-NEXT: shldq %cl, %rdi, %r9 ; SRC-NEXT: xorl %r11d, %r11d -; SRC-NEXT: movl %r9d, %ecx +; SRC-NEXT: movl %edx, %ecx ; SRC-NEXT: shldq %cl, %rdi, %r11 ; SRC-NEXT: movl $1, %ebx ; SRC-NEXT: shlq %cl, %rbx -; SRC-NEXT: testb $64, %r9b +; SRC-NEXT: testb $64, %dl ; SRC-NEXT: cmovneq %rbx, %r11 ; SRC-NEXT: cmovneq %r8, %rbx ; SRC-NEXT: movl %esi, %ecx ; SRC-NEXT: shlq %cl, %rdi ; SRC-NEXT: testb $64, %sil -; SRC-NEXT: cmovneq %rdi, %rdx +; SRC-NEXT: cmovneq %rdi, %r9 ; SRC-NEXT: cmovneq %r8, %rdi -; SRC-NEXT: testb %r9b, %r9b +; SRC-NEXT: testb %dl, %dl ; SRC-NEXT: cmovnsq %r10, %rdi ; SRC-NEXT: cmoveq %r8, %rdi -; SRC-NEXT: cmovnsq %r8, %rdx -; SRC-NEXT: cmoveq %r8, %rdx +; SRC-NEXT: cmovnsq %r8, %r9 +; SRC-NEXT: cmoveq %r8, %r9 ; SRC-NEXT: cmovsq %r8, %r11 ; SRC-NEXT: cmovsq %r8, %rbx ; SRC-NEXT: movq %r11, 8(%rax) ; SRC-NEXT: movq %rbx, (%rax) -; SRC-NEXT: movq %rdx, 24(%rax) +; SRC-NEXT: movq %r9, 24(%rax) ; SRC-NEXT: movq %rdi, 16(%rax) ; SRC-NEXT: popq %rbx ; SRC-NEXT: retq @@ -194,46 +194,46 @@ ; LIN-LABEL: test1: ; LIN: # %bb.0: ; LIN-NEXT: movq %rdi, %rax -; LIN-NEXT: xorl %r9d, %r9d +; LIN-NEXT: xorl %edi, %edi ; LIN-NEXT: movl $1, %r8d ; LIN-NEXT: addl %esi, %esi ; LIN-NEXT: leal 3(%rsi), %ecx -; LIN-NEXT: movl $1, %edi -; LIN-NEXT: shlq %cl, %rdi +; LIN-NEXT: movl $1, %edx +; LIN-NEXT: shlq %cl, %rdx ; LIN-NEXT: testb $64, %cl -; LIN-NEXT: movq %rdi, %rdx -; LIN-NEXT: cmovneq %r9, %rdx +; LIN-NEXT: movq %rdx, %r9 +; LIN-NEXT: cmovneq %rdi, %r9 ; LIN-NEXT: testb %cl, %cl -; LIN-NEXT: cmovsq %r9, %rdx -; LIN-NEXT: movq %rdx, (%rax) -; LIN-NEXT: xorl %edx, %edx +; LIN-NEXT: cmovsq %rdi, %r9 +; LIN-NEXT: movq %r9, 
(%rax) +; LIN-NEXT: xorl %r9d, %r9d ; LIN-NEXT: # kill: def $cl killed $cl killed $ecx -; LIN-NEXT: shldq %cl, %r8, %rdx -; LIN-NEXT: cmovneq %rdi, %rdx -; LIN-NEXT: cmovsq %r9, %rdx -; LIN-NEXT: movq %rdx, 8(%rax) -; LIN-NEXT: leal -125(%rsi), %r10d -; LIN-NEXT: movl $1, %edx -; LIN-NEXT: movl %r10d, %ecx -; LIN-NEXT: shlq %cl, %rdx -; LIN-NEXT: testb $64, %r10b -; LIN-NEXT: movq %rdx, %rdi -; LIN-NEXT: cmovneq %r9, %rdi +; LIN-NEXT: shldq %cl, %r8, %r9 +; LIN-NEXT: cmovneq %rdx, %r9 +; LIN-NEXT: cmovsq %rdi, %r9 +; LIN-NEXT: movq %r9, 8(%rax) +; LIN-NEXT: leal -125(%rsi), %edx +; LIN-NEXT: movl $1, %r9d +; LIN-NEXT: movl %edx, %ecx +; LIN-NEXT: shlq %cl, %r9 +; LIN-NEXT: testb $64, %dl +; LIN-NEXT: movq %r9, %r10 +; LIN-NEXT: cmovneq %rdi, %r10 ; LIN-NEXT: movb $125, %cl ; LIN-NEXT: subb %sil, %cl ; LIN-NEXT: movl $1, %esi -; LIN-NEXT: shrdq %cl, %r9, %rsi +; LIN-NEXT: shrdq %cl, %rdi, %rsi ; LIN-NEXT: testb $64, %cl -; LIN-NEXT: cmovneq %r9, %rsi -; LIN-NEXT: cmovsq %rdi, %rsi -; LIN-NEXT: cmoveq %r9, %rsi +; LIN-NEXT: cmovneq %rdi, %rsi +; LIN-NEXT: cmovsq %r10, %rsi +; LIN-NEXT: cmoveq %rdi, %rsi ; LIN-NEXT: movq %rsi, 16(%rax) ; LIN-NEXT: xorl %esi, %esi -; LIN-NEXT: movl %r10d, %ecx +; LIN-NEXT: movl %edx, %ecx ; LIN-NEXT: shldq %cl, %r8, %rsi -; LIN-NEXT: cmovneq %rdx, %rsi -; LIN-NEXT: cmovnsq %r9, %rsi -; LIN-NEXT: cmoveq %r9, %rsi +; LIN-NEXT: cmovneq %r9, %rsi +; LIN-NEXT: cmovnsq %rdi, %rsi +; LIN-NEXT: cmoveq %rdi, %rsi ; LIN-NEXT: movq %rsi, 24(%rax) ; LIN-NEXT: retq %b = add i256 %a, 1 @@ -250,38 +250,38 @@ ; ILP-LABEL: test2: ; ILP: # %bb.0: ; ILP-NEXT: movq %rdi, %rax -; ILP-NEXT: xorl %r9d, %r9d +; ILP-NEXT: xorl %edi, %edi ; ILP-NEXT: movq %rsi, %r11 ; ILP-NEXT: negq %r11 ; ILP-NEXT: movl $0, %r10d ; ILP-NEXT: sbbq %rdx, %r10 -; ILP-NEXT: movl $0, %edi -; ILP-NEXT: sbbq %rcx, %rdi -; ILP-NEXT: sbbq %r8, %r9 -; ILP-NEXT: andq %r8, %r9 -; ILP-NEXT: bsrq %r9, %r8 +; ILP-NEXT: movl $0, %r9d +; ILP-NEXT: sbbq %rcx, %r9 +; ILP-NEXT: sbbq %r8, %rdi +; ILP-NEXT: andq %r8, %rdi +; ILP-NEXT: bsrq %rdi, %r8 ; ILP-NEXT: andq %rdx, %r10 ; ILP-NEXT: bsrq %r10, %rdx ; ILP-NEXT: xorq $63, %r8 -; ILP-NEXT: andq %rcx, %rdi -; ILP-NEXT: bsrq %rdi, %rcx +; ILP-NEXT: andq %rcx, %r9 +; ILP-NEXT: bsrq %r9, %rcx ; ILP-NEXT: xorq $63, %rcx ; ILP-NEXT: addq $64, %rcx -; ILP-NEXT: testq %r9, %r9 +; ILP-NEXT: testq %rdi, %rdi ; ILP-NEXT: cmovneq %r8, %rcx ; ILP-NEXT: xorq $63, %rdx ; ILP-NEXT: andq %rsi, %r11 -; ILP-NEXT: movl $127, %r8d -; ILP-NEXT: bsrq %r11, %rsi -; ILP-NEXT: cmoveq %r8, %rsi -; ILP-NEXT: xorq $63, %rsi -; ILP-NEXT: addq $64, %rsi +; ILP-NEXT: movl $127, %esi +; ILP-NEXT: bsrq %r11, %r8 +; ILP-NEXT: cmoveq %rsi, %r8 +; ILP-NEXT: xorq $63, %r8 +; ILP-NEXT: addq $64, %r8 ; ILP-NEXT: testq %r10, %r10 -; ILP-NEXT: cmovneq %rdx, %rsi -; ILP-NEXT: subq $-128, %rsi -; ILP-NEXT: orq %r9, %rdi -; ILP-NEXT: cmovneq %rcx, %rsi -; ILP-NEXT: movq %rsi, (%rax) +; ILP-NEXT: cmovneq %rdx, %r8 +; ILP-NEXT: subq $-128, %r8 +; ILP-NEXT: orq %rdi, %r9 +; ILP-NEXT: cmovneq %rcx, %r8 +; ILP-NEXT: movq %r8, (%rax) ; ILP-NEXT: movq $0, 24(%rax) ; ILP-NEXT: movq $0, 16(%rax) ; ILP-NEXT: movq $0, 8(%rax) @@ -290,38 +290,38 @@ ; HYBRID-LABEL: test2: ; HYBRID: # %bb.0: ; HYBRID-NEXT: movq %rdi, %rax -; HYBRID-NEXT: xorl %r9d, %r9d +; HYBRID-NEXT: xorl %edi, %edi ; HYBRID-NEXT: movq %rsi, %r11 ; HYBRID-NEXT: negq %r11 ; HYBRID-NEXT: movl $0, %r10d ; HYBRID-NEXT: sbbq %rdx, %r10 -; HYBRID-NEXT: movl $0, %edi -; HYBRID-NEXT: sbbq %rcx, %rdi -; HYBRID-NEXT: sbbq %r8, %r9 -; HYBRID-NEXT: andq %r8, %r9 -; 
HYBRID-NEXT: bsrq %r9, %r8 +; HYBRID-NEXT: movl $0, %r9d +; HYBRID-NEXT: sbbq %rcx, %r9 +; HYBRID-NEXT: sbbq %r8, %rdi +; HYBRID-NEXT: andq %r8, %rdi +; HYBRID-NEXT: bsrq %rdi, %r8 ; HYBRID-NEXT: xorq $63, %r8 -; HYBRID-NEXT: andq %rcx, %rdi -; HYBRID-NEXT: bsrq %rdi, %rcx +; HYBRID-NEXT: andq %rcx, %r9 +; HYBRID-NEXT: bsrq %r9, %rcx ; HYBRID-NEXT: xorq $63, %rcx ; HYBRID-NEXT: addq $64, %rcx -; HYBRID-NEXT: testq %r9, %r9 +; HYBRID-NEXT: testq %rdi, %rdi ; HYBRID-NEXT: cmovneq %r8, %rcx ; HYBRID-NEXT: andq %rdx, %r10 ; HYBRID-NEXT: bsrq %r10, %rdx ; HYBRID-NEXT: xorq $63, %rdx ; HYBRID-NEXT: andq %rsi, %r11 -; HYBRID-NEXT: movl $127, %r8d -; HYBRID-NEXT: bsrq %r11, %rsi -; HYBRID-NEXT: cmoveq %r8, %rsi -; HYBRID-NEXT: xorq $63, %rsi -; HYBRID-NEXT: addq $64, %rsi +; HYBRID-NEXT: movl $127, %esi +; HYBRID-NEXT: bsrq %r11, %r8 +; HYBRID-NEXT: cmoveq %rsi, %r8 +; HYBRID-NEXT: xorq $63, %r8 +; HYBRID-NEXT: addq $64, %r8 ; HYBRID-NEXT: testq %r10, %r10 -; HYBRID-NEXT: cmovneq %rdx, %rsi -; HYBRID-NEXT: subq $-128, %rsi -; HYBRID-NEXT: orq %r9, %rdi -; HYBRID-NEXT: cmovneq %rcx, %rsi -; HYBRID-NEXT: movq %rsi, (%rax) +; HYBRID-NEXT: cmovneq %rdx, %r8 +; HYBRID-NEXT: subq $-128, %r8 +; HYBRID-NEXT: orq %rdi, %r9 +; HYBRID-NEXT: cmovneq %rcx, %r8 +; HYBRID-NEXT: movq %r8, (%rax) ; HYBRID-NEXT: movq $0, 24(%rax) ; HYBRID-NEXT: movq $0, 16(%rax) ; HYBRID-NEXT: movq $0, 8(%rax) @@ -330,38 +330,38 @@ ; BURR-LABEL: test2: ; BURR: # %bb.0: ; BURR-NEXT: movq %rdi, %rax -; BURR-NEXT: xorl %r9d, %r9d +; BURR-NEXT: xorl %edi, %edi ; BURR-NEXT: movq %rsi, %r11 ; BURR-NEXT: negq %r11 ; BURR-NEXT: movl $0, %r10d ; BURR-NEXT: sbbq %rdx, %r10 -; BURR-NEXT: movl $0, %edi -; BURR-NEXT: sbbq %rcx, %rdi -; BURR-NEXT: sbbq %r8, %r9 -; BURR-NEXT: andq %r8, %r9 -; BURR-NEXT: bsrq %r9, %r8 +; BURR-NEXT: movl $0, %r9d +; BURR-NEXT: sbbq %rcx, %r9 +; BURR-NEXT: sbbq %r8, %rdi +; BURR-NEXT: andq %r8, %rdi +; BURR-NEXT: bsrq %rdi, %r8 ; BURR-NEXT: xorq $63, %r8 -; BURR-NEXT: andq %rcx, %rdi -; BURR-NEXT: bsrq %rdi, %rcx +; BURR-NEXT: andq %rcx, %r9 +; BURR-NEXT: bsrq %r9, %rcx ; BURR-NEXT: xorq $63, %rcx ; BURR-NEXT: addq $64, %rcx -; BURR-NEXT: testq %r9, %r9 +; BURR-NEXT: testq %rdi, %rdi ; BURR-NEXT: cmovneq %r8, %rcx ; BURR-NEXT: andq %rdx, %r10 ; BURR-NEXT: bsrq %r10, %rdx ; BURR-NEXT: xorq $63, %rdx ; BURR-NEXT: andq %rsi, %r11 -; BURR-NEXT: movl $127, %r8d -; BURR-NEXT: bsrq %r11, %rsi -; BURR-NEXT: cmoveq %r8, %rsi -; BURR-NEXT: xorq $63, %rsi -; BURR-NEXT: addq $64, %rsi +; BURR-NEXT: movl $127, %esi +; BURR-NEXT: bsrq %r11, %r8 +; BURR-NEXT: cmoveq %rsi, %r8 +; BURR-NEXT: xorq $63, %r8 +; BURR-NEXT: addq $64, %r8 ; BURR-NEXT: testq %r10, %r10 -; BURR-NEXT: cmovneq %rdx, %rsi -; BURR-NEXT: subq $-128, %rsi -; BURR-NEXT: orq %r9, %rdi -; BURR-NEXT: cmovneq %rcx, %rsi -; BURR-NEXT: movq %rsi, (%rax) +; BURR-NEXT: cmovneq %rdx, %r8 +; BURR-NEXT: subq $-128, %r8 +; BURR-NEXT: orq %rdi, %r9 +; BURR-NEXT: cmovneq %rcx, %r8 +; BURR-NEXT: movq %r8, (%rax) ; BURR-NEXT: movq $0, 24(%rax) ; BURR-NEXT: movq $0, 16(%rax) ; BURR-NEXT: movq $0, 8(%rax) @@ -391,17 +391,17 @@ ; SRC-NEXT: cmovneq %rcx, %rdx ; SRC-NEXT: bsrq %r10, %rcx ; SRC-NEXT: xorq $63, %rcx -; SRC-NEXT: bsrq %r11, %r8 -; SRC-NEXT: movl $127, %esi -; SRC-NEXT: cmovneq %r8, %rsi -; SRC-NEXT: xorq $63, %rsi -; SRC-NEXT: addq $64, %rsi +; SRC-NEXT: bsrq %r11, %rsi +; SRC-NEXT: movl $127, %r8d +; SRC-NEXT: cmovneq %rsi, %r8 +; SRC-NEXT: xorq $63, %r8 +; SRC-NEXT: addq $64, %r8 ; SRC-NEXT: testq %r10, %r10 -; SRC-NEXT: cmovneq %rcx, %rsi -; SRC-NEXT: subq $-128, %rsi 
+; SRC-NEXT: cmovneq %rcx, %r8 +; SRC-NEXT: subq $-128, %r8 ; SRC-NEXT: orq %r9, %rdi -; SRC-NEXT: cmovneq %rdx, %rsi -; SRC-NEXT: movq %rsi, (%rax) +; SRC-NEXT: cmovneq %rdx, %r8 +; SRC-NEXT: movq %r8, (%rax) ; SRC-NEXT: movq $0, 24(%rax) ; SRC-NEXT: movq $0, 16(%rax) ; SRC-NEXT: movq $0, 8(%rax) @@ -418,30 +418,30 @@ ; LIN-NEXT: cmovneq %rsi, %rdi ; LIN-NEXT: xorq $63, %rdi ; LIN-NEXT: addq $64, %rdi -; LIN-NEXT: xorl %r9d, %r9d -; LIN-NEXT: movl $0, %esi -; LIN-NEXT: sbbq %rdx, %rsi -; LIN-NEXT: andq %rdx, %rsi -; LIN-NEXT: bsrq %rsi, %rdx +; LIN-NEXT: xorl %esi, %esi +; LIN-NEXT: movl $0, %r9d +; LIN-NEXT: sbbq %rdx, %r9 +; LIN-NEXT: andq %rdx, %r9 +; LIN-NEXT: bsrq %r9, %rdx ; LIN-NEXT: xorq $63, %rdx -; LIN-NEXT: testq %rsi, %rsi +; LIN-NEXT: testq %r9, %r9 ; LIN-NEXT: cmoveq %rdi, %rdx ; LIN-NEXT: subq $-128, %rdx -; LIN-NEXT: movl $0, %esi -; LIN-NEXT: sbbq %rcx, %rsi -; LIN-NEXT: andq %rcx, %rsi -; LIN-NEXT: bsrq %rsi, %rcx +; LIN-NEXT: movl $0, %edi +; LIN-NEXT: sbbq %rcx, %rdi +; LIN-NEXT: andq %rcx, %rdi +; LIN-NEXT: bsrq %rdi, %rcx ; LIN-NEXT: xorq $63, %rcx ; LIN-NEXT: addq $64, %rcx -; LIN-NEXT: sbbq %r8, %r9 -; LIN-NEXT: andq %r8, %r9 -; LIN-NEXT: bsrq %r9, %rdi -; LIN-NEXT: xorq $63, %rdi -; LIN-NEXT: testq %r9, %r9 -; LIN-NEXT: cmoveq %rcx, %rdi -; LIN-NEXT: orq %rsi, %r9 -; LIN-NEXT: cmoveq %rdx, %rdi -; LIN-NEXT: movq %rdi, (%rax) +; LIN-NEXT: sbbq %r8, %rsi +; LIN-NEXT: andq %r8, %rsi +; LIN-NEXT: bsrq %rsi, %r8 +; LIN-NEXT: xorq $63, %r8 +; LIN-NEXT: testq %rsi, %rsi +; LIN-NEXT: cmoveq %rcx, %r8 +; LIN-NEXT: orq %rdi, %rsi +; LIN-NEXT: cmoveq %rdx, %r8 +; LIN-NEXT: movq %r8, (%rax) ; LIN-NEXT: movq $0, 8(%rax) ; LIN-NEXT: movq $0, 16(%rax) ; LIN-NEXT: movq $0, 24(%rax) @@ -457,41 +457,41 @@ ; ILP: # %bb.0: ; ILP-NEXT: pushq %rbx ; ILP-NEXT: movq %rdi, %rax -; ILP-NEXT: xorl %edi, %edi -; ILP-NEXT: movq %rsi, %r9 -; ILP-NEXT: negq %r9 +; ILP-NEXT: xorl %r9d, %r9d +; ILP-NEXT: movq %rsi, %rdi +; ILP-NEXT: negq %rdi ; ILP-NEXT: movl $0, %r10d ; ILP-NEXT: sbbq %rdx, %r10 ; ILP-NEXT: movl $0, %r11d ; ILP-NEXT: sbbq %rcx, %r11 -; ILP-NEXT: sbbq %r8, %rdi +; ILP-NEXT: sbbq %r8, %r9 ; ILP-NEXT: notq %r8 -; ILP-NEXT: andq %rdi, %r8 +; ILP-NEXT: andq %r9, %r8 ; ILP-NEXT: bsrq %r8, %rbx ; ILP-NEXT: notq %rdx ; ILP-NEXT: andq %r10, %rdx -; ILP-NEXT: bsrq %rdx, %r10 +; ILP-NEXT: bsrq %rdx, %r9 ; ILP-NEXT: notq %rsi ; ILP-NEXT: xorq $63, %rbx ; ILP-NEXT: notq %rcx ; ILP-NEXT: andq %r11, %rcx -; ILP-NEXT: bsrq %rcx, %rdi -; ILP-NEXT: xorq $63, %rdi -; ILP-NEXT: addq $64, %rdi -; ILP-NEXT: testq %r8, %r8 -; ILP-NEXT: cmovneq %rbx, %rdi +; ILP-NEXT: bsrq %rcx, %r10 ; ILP-NEXT: xorq $63, %r10 -; ILP-NEXT: andq %r9, %rsi -; ILP-NEXT: movl $127, %ebx +; ILP-NEXT: addq $64, %r10 +; ILP-NEXT: testq %r8, %r8 +; ILP-NEXT: cmovneq %rbx, %r10 +; ILP-NEXT: xorq $63, %r9 +; ILP-NEXT: andq %rdi, %rsi +; ILP-NEXT: movl $127, %edi ; ILP-NEXT: bsrq %rsi, %rsi -; ILP-NEXT: cmoveq %rbx, %rsi +; ILP-NEXT: cmoveq %rdi, %rsi ; ILP-NEXT: xorq $63, %rsi ; ILP-NEXT: addq $64, %rsi ; ILP-NEXT: testq %rdx, %rdx -; ILP-NEXT: cmovneq %r10, %rsi +; ILP-NEXT: cmovneq %r9, %rsi ; ILP-NEXT: subq $-128, %rsi ; ILP-NEXT: orq %r8, %rcx -; ILP-NEXT: cmovneq %rdi, %rsi +; ILP-NEXT: cmovneq %r10, %rsi ; ILP-NEXT: movq %rsi, (%rax) ; ILP-NEXT: movq $0, 24(%rax) ; ILP-NEXT: movq $0, 16(%rax) @@ -503,41 +503,41 @@ ; HYBRID: # %bb.0: ; HYBRID-NEXT: pushq %rbx ; HYBRID-NEXT: movq %rdi, %rax -; HYBRID-NEXT: xorl %edi, %edi -; HYBRID-NEXT: movq %rsi, %r9 -; HYBRID-NEXT: negq %r9 +; HYBRID-NEXT: xorl %r9d, %r9d +; HYBRID-NEXT: 
movq %rsi, %rdi +; HYBRID-NEXT: negq %rdi ; HYBRID-NEXT: movl $0, %r10d ; HYBRID-NEXT: sbbq %rdx, %r10 ; HYBRID-NEXT: movl $0, %r11d ; HYBRID-NEXT: sbbq %rcx, %r11 -; HYBRID-NEXT: sbbq %r8, %rdi +; HYBRID-NEXT: sbbq %r8, %r9 ; HYBRID-NEXT: notq %r8 -; HYBRID-NEXT: andq %rdi, %r8 +; HYBRID-NEXT: andq %r9, %r8 ; HYBRID-NEXT: bsrq %r8, %rbx ; HYBRID-NEXT: xorq $63, %rbx ; HYBRID-NEXT: notq %rcx ; HYBRID-NEXT: andq %r11, %rcx -; HYBRID-NEXT: bsrq %rcx, %rdi -; HYBRID-NEXT: xorq $63, %rdi -; HYBRID-NEXT: addq $64, %rdi +; HYBRID-NEXT: bsrq %rcx, %r9 +; HYBRID-NEXT: xorq $63, %r9 +; HYBRID-NEXT: addq $64, %r9 ; HYBRID-NEXT: testq %r8, %r8 -; HYBRID-NEXT: cmovneq %rbx, %rdi +; HYBRID-NEXT: cmovneq %rbx, %r9 ; HYBRID-NEXT: notq %rdx ; HYBRID-NEXT: andq %r10, %rdx -; HYBRID-NEXT: bsrq %rdx, %rbx -; HYBRID-NEXT: xorq $63, %rbx +; HYBRID-NEXT: bsrq %rdx, %r10 +; HYBRID-NEXT: xorq $63, %r10 ; HYBRID-NEXT: notq %rsi -; HYBRID-NEXT: andq %r9, %rsi -; HYBRID-NEXT: movl $127, %r9d +; HYBRID-NEXT: andq %rdi, %rsi +; HYBRID-NEXT: movl $127, %edi ; HYBRID-NEXT: bsrq %rsi, %rsi -; HYBRID-NEXT: cmoveq %r9, %rsi +; HYBRID-NEXT: cmoveq %rdi, %rsi ; HYBRID-NEXT: xorq $63, %rsi ; HYBRID-NEXT: addq $64, %rsi ; HYBRID-NEXT: testq %rdx, %rdx -; HYBRID-NEXT: cmovneq %rbx, %rsi +; HYBRID-NEXT: cmovneq %r10, %rsi ; HYBRID-NEXT: subq $-128, %rsi ; HYBRID-NEXT: orq %r8, %rcx -; HYBRID-NEXT: cmovneq %rdi, %rsi +; HYBRID-NEXT: cmovneq %r9, %rsi ; HYBRID-NEXT: movq %rsi, (%rax) ; HYBRID-NEXT: movq $0, 24(%rax) ; HYBRID-NEXT: movq $0, 16(%rax) @@ -549,41 +549,41 @@ ; BURR: # %bb.0: ; BURR-NEXT: pushq %rbx ; BURR-NEXT: movq %rdi, %rax -; BURR-NEXT: xorl %edi, %edi -; BURR-NEXT: movq %rsi, %r9 -; BURR-NEXT: negq %r9 +; BURR-NEXT: xorl %r9d, %r9d +; BURR-NEXT: movq %rsi, %rdi +; BURR-NEXT: negq %rdi ; BURR-NEXT: movl $0, %r10d ; BURR-NEXT: sbbq %rdx, %r10 ; BURR-NEXT: movl $0, %r11d ; BURR-NEXT: sbbq %rcx, %r11 -; BURR-NEXT: sbbq %r8, %rdi +; BURR-NEXT: sbbq %r8, %r9 ; BURR-NEXT: notq %r8 -; BURR-NEXT: andq %rdi, %r8 +; BURR-NEXT: andq %r9, %r8 ; BURR-NEXT: bsrq %r8, %rbx ; BURR-NEXT: xorq $63, %rbx ; BURR-NEXT: notq %rcx ; BURR-NEXT: andq %r11, %rcx -; BURR-NEXT: bsrq %rcx, %rdi -; BURR-NEXT: xorq $63, %rdi -; BURR-NEXT: addq $64, %rdi +; BURR-NEXT: bsrq %rcx, %r9 +; BURR-NEXT: xorq $63, %r9 +; BURR-NEXT: addq $64, %r9 ; BURR-NEXT: testq %r8, %r8 -; BURR-NEXT: cmovneq %rbx, %rdi +; BURR-NEXT: cmovneq %rbx, %r9 ; BURR-NEXT: notq %rdx ; BURR-NEXT: andq %r10, %rdx -; BURR-NEXT: bsrq %rdx, %rbx -; BURR-NEXT: xorq $63, %rbx +; BURR-NEXT: bsrq %rdx, %r10 +; BURR-NEXT: xorq $63, %r10 ; BURR-NEXT: notq %rsi -; BURR-NEXT: andq %r9, %rsi -; BURR-NEXT: movl $127, %r9d +; BURR-NEXT: andq %rdi, %rsi +; BURR-NEXT: movl $127, %edi ; BURR-NEXT: bsrq %rsi, %rsi -; BURR-NEXT: cmoveq %r9, %rsi +; BURR-NEXT: cmoveq %rdi, %rsi ; BURR-NEXT: xorq $63, %rsi ; BURR-NEXT: addq $64, %rsi ; BURR-NEXT: testq %rdx, %rdx -; BURR-NEXT: cmovneq %rbx, %rsi +; BURR-NEXT: cmovneq %r10, %rsi ; BURR-NEXT: subq $-128, %rsi ; BURR-NEXT: orq %r8, %rcx -; BURR-NEXT: cmovneq %rdi, %rsi +; BURR-NEXT: cmovneq %r9, %rsi ; BURR-NEXT: movq %rsi, (%rax) ; BURR-NEXT: movq $0, 24(%rax) ; BURR-NEXT: movq $0, 16(%rax) @@ -594,42 +594,42 @@ ; SRC-LABEL: test3: ; SRC: # %bb.0: ; SRC-NEXT: movq %rdi, %rax -; SRC-NEXT: movq %rsi, %r9 -; SRC-NEXT: notq %r9 -; SRC-NEXT: xorl %r10d, %r10d +; SRC-NEXT: movq %rsi, %rdi +; SRC-NEXT: notq %rdi +; SRC-NEXT: xorl %r9d, %r9d ; SRC-NEXT: negq %rsi -; SRC-NEXT: movl $0, %r11d -; SRC-NEXT: sbbq %rdx, %r11 +; SRC-NEXT: movl $0, %r10d +; SRC-NEXT: 
sbbq %rdx, %r10 ; SRC-NEXT: notq %rdx -; SRC-NEXT: movl $0, %edi -; SRC-NEXT: sbbq %rcx, %rdi +; SRC-NEXT: movl $0, %r11d +; SRC-NEXT: sbbq %rcx, %r11 ; SRC-NEXT: notq %rcx -; SRC-NEXT: sbbq %r8, %r10 +; SRC-NEXT: sbbq %r8, %r9 ; SRC-NEXT: notq %r8 -; SRC-NEXT: andq %r11, %rdx -; SRC-NEXT: andq %rdi, %rcx -; SRC-NEXT: andq %r10, %r8 -; SRC-NEXT: andq %r9, %rsi -; SRC-NEXT: bsrq %r8, %r9 -; SRC-NEXT: xorq $63, %r9 -; SRC-NEXT: bsrq %rcx, %rdi +; SRC-NEXT: andq %r10, %rdx +; SRC-NEXT: andq %r11, %rcx +; SRC-NEXT: andq %r9, %r8 +; SRC-NEXT: andq %rdi, %rsi +; SRC-NEXT: bsrq %r8, %rdi ; SRC-NEXT: xorq $63, %rdi -; SRC-NEXT: addq $64, %rdi -; SRC-NEXT: testq %r8, %r8 -; SRC-NEXT: cmovneq %r9, %rdi -; SRC-NEXT: bsrq %rdx, %r9 +; SRC-NEXT: bsrq %rcx, %r9 ; SRC-NEXT: xorq $63, %r9 -; SRC-NEXT: bsrq %rsi, %r10 -; SRC-NEXT: movl $127, %esi -; SRC-NEXT: cmovneq %r10, %rsi -; SRC-NEXT: xorq $63, %rsi -; SRC-NEXT: addq $64, %rsi +; SRC-NEXT: addq $64, %r9 +; SRC-NEXT: testq %r8, %r8 +; SRC-NEXT: cmovneq %rdi, %r9 +; SRC-NEXT: bsrq %rdx, %rdi +; SRC-NEXT: xorq $63, %rdi +; SRC-NEXT: bsrq %rsi, %rsi +; SRC-NEXT: movl $127, %r10d +; SRC-NEXT: cmovneq %rsi, %r10 +; SRC-NEXT: xorq $63, %r10 +; SRC-NEXT: addq $64, %r10 ; SRC-NEXT: testq %rdx, %rdx -; SRC-NEXT: cmovneq %r9, %rsi -; SRC-NEXT: subq $-128, %rsi +; SRC-NEXT: cmovneq %rdi, %r10 +; SRC-NEXT: subq $-128, %r10 ; SRC-NEXT: orq %rcx, %r8 -; SRC-NEXT: cmovneq %rdi, %rsi -; SRC-NEXT: movq %rsi, (%rax) +; SRC-NEXT: cmovneq %r9, %r10 +; SRC-NEXT: movq %r10, (%rax) ; SRC-NEXT: movq $0, 24(%rax) ; SRC-NEXT: movq $0, 16(%rax) ; SRC-NEXT: movq $0, 8(%rax) @@ -643,11 +643,11 @@ ; LIN-NEXT: notq %rsi ; LIN-NEXT: andq %rdi, %rsi ; LIN-NEXT: bsrq %rsi, %rsi -; LIN-NEXT: movl $127, %edi -; LIN-NEXT: cmovneq %rsi, %rdi -; LIN-NEXT: xorq $63, %rdi -; LIN-NEXT: addq $64, %rdi -; LIN-NEXT: xorl %r9d, %r9d +; LIN-NEXT: movl $127, %r9d +; LIN-NEXT: cmovneq %rsi, %r9 +; LIN-NEXT: xorq $63, %r9 +; LIN-NEXT: addq $64, %r9 +; LIN-NEXT: xorl %edi, %edi ; LIN-NEXT: movl $0, %esi ; LIN-NEXT: sbbq %rdx, %rsi ; LIN-NEXT: notq %rdx @@ -655,7 +655,7 @@ ; LIN-NEXT: bsrq %rdx, %rsi ; LIN-NEXT: xorq $63, %rsi ; LIN-NEXT: testq %rdx, %rdx -; LIN-NEXT: cmoveq %rdi, %rsi +; LIN-NEXT: cmoveq %r9, %rsi ; LIN-NEXT: subq $-128, %rsi ; LIN-NEXT: movl $0, %edx ; LIN-NEXT: sbbq %rcx, %rdx @@ -664,9 +664,9 @@ ; LIN-NEXT: bsrq %rcx, %rdx ; LIN-NEXT: xorq $63, %rdx ; LIN-NEXT: addq $64, %rdx -; LIN-NEXT: sbbq %r8, %r9 +; LIN-NEXT: sbbq %r8, %rdi ; LIN-NEXT: notq %r8 -; LIN-NEXT: andq %r9, %r8 +; LIN-NEXT: andq %rdi, %r8 ; LIN-NEXT: bsrq %r8, %rdi ; LIN-NEXT: xorq $63, %rdi ; LIN-NEXT: testq %r8, %r8 @@ -777,27 +777,27 @@ ; ILP: # %bb.0: ; ILP-NEXT: pushq %rbx ; ILP-NEXT: movq %rdi, %rax -; ILP-NEXT: xorl %r9d, %r9d +; ILP-NEXT: xorl %edi, %edi ; ILP-NEXT: movq %rsi, %rbx ; ILP-NEXT: negq %rbx ; ILP-NEXT: movl $0, %r11d ; ILP-NEXT: sbbq %rdx, %r11 +; ILP-NEXT: movl $0, %r9d +; ILP-NEXT: sbbq %rcx, %r9 ; ILP-NEXT: movl $0, %r10d -; ILP-NEXT: sbbq %rcx, %r10 -; ILP-NEXT: movl $0, %edi -; ILP-NEXT: sbbq %r8, %rdi +; ILP-NEXT: sbbq %r8, %r10 ; ILP-NEXT: orq %r8, %rdx ; ILP-NEXT: orq %rcx, %rsi ; ILP-NEXT: orq %rdx, %rsi ; ILP-NEXT: je .LBB4_1 ; ILP-NEXT: # %bb.2: # %cond.false ; ILP-NEXT: bsrq %r11, %rdx -; ILP-NEXT: bsrq %rdi, %rcx +; ILP-NEXT: bsrq %r10, %rcx ; ILP-NEXT: xorq $63, %rcx -; ILP-NEXT: bsrq %r10, %rsi +; ILP-NEXT: bsrq %r9, %rsi ; ILP-NEXT: xorq $63, %rsi ; ILP-NEXT: addq $64, %rsi -; ILP-NEXT: testq %rdi, %rdi +; ILP-NEXT: testq %r10, %r10 ; ILP-NEXT: cmovneq %rcx, %rsi ; ILP-NEXT: xorq 
$63, %rdx ; ILP-NEXT: bsrq %rbx, %rcx @@ -806,17 +806,17 @@ ; ILP-NEXT: testq %r11, %r11 ; ILP-NEXT: cmovneq %rdx, %rcx ; ILP-NEXT: subq $-128, %rcx -; ILP-NEXT: xorl %r9d, %r9d -; ILP-NEXT: orq %rdi, %r10 +; ILP-NEXT: xorl %edi, %edi +; ILP-NEXT: orq %r10, %r9 ; ILP-NEXT: cmovneq %rsi, %rcx ; ILP-NEXT: jmp .LBB4_3 ; ILP-NEXT: .LBB4_1: ; ILP-NEXT: movl $256, %ecx # imm = 0x100 ; ILP-NEXT: .LBB4_3: # %cond.end ; ILP-NEXT: movq %rcx, (%rax) -; ILP-NEXT: movq %r9, 8(%rax) -; ILP-NEXT: movq %r9, 16(%rax) -; ILP-NEXT: movq %r9, 24(%rax) +; ILP-NEXT: movq %rdi, 8(%rax) +; ILP-NEXT: movq %rdi, 16(%rax) +; ILP-NEXT: movq %rdi, 24(%rax) ; ILP-NEXT: popq %rbx ; ILP-NEXT: retq ; @@ -824,26 +824,26 @@ ; HYBRID: # %bb.0: ; HYBRID-NEXT: pushq %rbx ; HYBRID-NEXT: movq %rdi, %rax -; HYBRID-NEXT: xorl %r9d, %r9d +; HYBRID-NEXT: xorl %edi, %edi ; HYBRID-NEXT: movq %rsi, %rbx ; HYBRID-NEXT: negq %rbx ; HYBRID-NEXT: movl $0, %r11d ; HYBRID-NEXT: sbbq %rdx, %r11 +; HYBRID-NEXT: movl $0, %r9d +; HYBRID-NEXT: sbbq %rcx, %r9 ; HYBRID-NEXT: movl $0, %r10d -; HYBRID-NEXT: sbbq %rcx, %r10 -; HYBRID-NEXT: movl $0, %edi -; HYBRID-NEXT: sbbq %r8, %rdi +; HYBRID-NEXT: sbbq %r8, %r10 ; HYBRID-NEXT: orq %r8, %rdx ; HYBRID-NEXT: orq %rcx, %rsi ; HYBRID-NEXT: orq %rdx, %rsi ; HYBRID-NEXT: je .LBB4_1 ; HYBRID-NEXT: # %bb.2: # %cond.false -; HYBRID-NEXT: bsrq %rdi, %rcx +; HYBRID-NEXT: bsrq %r10, %rcx ; HYBRID-NEXT: xorq $63, %rcx -; HYBRID-NEXT: bsrq %r10, %rdx +; HYBRID-NEXT: bsrq %r9, %rdx ; HYBRID-NEXT: xorq $63, %rdx ; HYBRID-NEXT: addq $64, %rdx -; HYBRID-NEXT: testq %rdi, %rdi +; HYBRID-NEXT: testq %r10, %r10 ; HYBRID-NEXT: cmovneq %rcx, %rdx ; HYBRID-NEXT: bsrq %r11, %rsi ; HYBRID-NEXT: xorq $63, %rsi @@ -853,17 +853,17 @@ ; HYBRID-NEXT: testq %r11, %r11 ; HYBRID-NEXT: cmovneq %rsi, %rcx ; HYBRID-NEXT: subq $-128, %rcx -; HYBRID-NEXT: orq %rdi, %r10 +; HYBRID-NEXT: orq %r10, %r9 ; HYBRID-NEXT: cmovneq %rdx, %rcx -; HYBRID-NEXT: xorl %r9d, %r9d +; HYBRID-NEXT: xorl %edi, %edi ; HYBRID-NEXT: jmp .LBB4_3 ; HYBRID-NEXT: .LBB4_1: ; HYBRID-NEXT: movl $256, %ecx # imm = 0x100 ; HYBRID-NEXT: .LBB4_3: # %cond.end ; HYBRID-NEXT: movq %rcx, (%rax) -; HYBRID-NEXT: movq %r9, 8(%rax) -; HYBRID-NEXT: movq %r9, 16(%rax) -; HYBRID-NEXT: movq %r9, 24(%rax) +; HYBRID-NEXT: movq %rdi, 8(%rax) +; HYBRID-NEXT: movq %rdi, 16(%rax) +; HYBRID-NEXT: movq %rdi, 24(%rax) ; HYBRID-NEXT: popq %rbx ; HYBRID-NEXT: retq ; @@ -871,26 +871,26 @@ ; BURR: # %bb.0: ; BURR-NEXT: pushq %rbx ; BURR-NEXT: movq %rdi, %rax -; BURR-NEXT: xorl %r9d, %r9d +; BURR-NEXT: xorl %edi, %edi ; BURR-NEXT: movq %rsi, %rbx ; BURR-NEXT: negq %rbx ; BURR-NEXT: movl $0, %r11d ; BURR-NEXT: sbbq %rdx, %r11 +; BURR-NEXT: movl $0, %r9d +; BURR-NEXT: sbbq %rcx, %r9 ; BURR-NEXT: movl $0, %r10d -; BURR-NEXT: sbbq %rcx, %r10 -; BURR-NEXT: movl $0, %edi -; BURR-NEXT: sbbq %r8, %rdi +; BURR-NEXT: sbbq %r8, %r10 ; BURR-NEXT: orq %r8, %rdx ; BURR-NEXT: orq %rcx, %rsi ; BURR-NEXT: orq %rdx, %rsi ; BURR-NEXT: je .LBB4_1 ; BURR-NEXT: # %bb.2: # %cond.false -; BURR-NEXT: bsrq %rdi, %rcx +; BURR-NEXT: bsrq %r10, %rcx ; BURR-NEXT: xorq $63, %rcx -; BURR-NEXT: bsrq %r10, %rdx +; BURR-NEXT: bsrq %r9, %rdx ; BURR-NEXT: xorq $63, %rdx ; BURR-NEXT: addq $64, %rdx -; BURR-NEXT: testq %rdi, %rdi +; BURR-NEXT: testq %r10, %r10 ; BURR-NEXT: cmovneq %rcx, %rdx ; BURR-NEXT: bsrq %r11, %rsi ; BURR-NEXT: xorq $63, %rsi @@ -900,17 +900,17 @@ ; BURR-NEXT: testq %r11, %r11 ; BURR-NEXT: cmovneq %rsi, %rcx ; BURR-NEXT: subq $-128, %rcx -; BURR-NEXT: orq %rdi, %r10 +; BURR-NEXT: orq %r10, %r9 ; BURR-NEXT: cmovneq 
%rdx, %rcx -; BURR-NEXT: xorl %r9d, %r9d +; BURR-NEXT: xorl %edi, %edi ; BURR-NEXT: jmp .LBB4_3 ; BURR-NEXT: .LBB4_1: ; BURR-NEXT: movl $256, %ecx # imm = 0x100 ; BURR-NEXT: .LBB4_3: # %cond.end ; BURR-NEXT: movq %rcx, (%rax) -; BURR-NEXT: movq %r9, 8(%rax) -; BURR-NEXT: movq %r9, 16(%rax) -; BURR-NEXT: movq %r9, 24(%rax) +; BURR-NEXT: movq %rdi, 8(%rax) +; BURR-NEXT: movq %rdi, 16(%rax) +; BURR-NEXT: movq %rdi, 24(%rax) ; BURR-NEXT: popq %rbx ; BURR-NEXT: retq ; @@ -918,26 +918,26 @@ ; SRC: # %bb.0: ; SRC-NEXT: pushq %rbx ; SRC-NEXT: movq %rdi, %rax -; SRC-NEXT: xorl %r9d, %r9d +; SRC-NEXT: xorl %edi, %edi ; SRC-NEXT: movq %rsi, %rbx ; SRC-NEXT: negq %rbx ; SRC-NEXT: movl $0, %r11d ; SRC-NEXT: sbbq %rdx, %r11 +; SRC-NEXT: movl $0, %r9d +; SRC-NEXT: sbbq %rcx, %r9 ; SRC-NEXT: movl $0, %r10d -; SRC-NEXT: sbbq %rcx, %r10 -; SRC-NEXT: movl $0, %edi -; SRC-NEXT: sbbq %r8, %rdi +; SRC-NEXT: sbbq %r8, %r10 ; SRC-NEXT: orq %r8, %rdx ; SRC-NEXT: orq %rcx, %rsi ; SRC-NEXT: orq %rdx, %rsi ; SRC-NEXT: je .LBB4_1 ; SRC-NEXT: # %bb.2: # %cond.false -; SRC-NEXT: bsrq %rdi, %rcx +; SRC-NEXT: bsrq %r10, %rcx ; SRC-NEXT: xorq $63, %rcx -; SRC-NEXT: bsrq %r10, %rdx +; SRC-NEXT: bsrq %r9, %rdx ; SRC-NEXT: xorq $63, %rdx ; SRC-NEXT: addq $64, %rdx -; SRC-NEXT: testq %rdi, %rdi +; SRC-NEXT: testq %r10, %r10 ; SRC-NEXT: cmovneq %rcx, %rdx ; SRC-NEXT: bsrq %r11, %rsi ; SRC-NEXT: xorq $63, %rsi @@ -947,17 +947,17 @@ ; SRC-NEXT: testq %r11, %r11 ; SRC-NEXT: cmovneq %rsi, %rcx ; SRC-NEXT: subq $-128, %rcx -; SRC-NEXT: orq %rdi, %r10 +; SRC-NEXT: orq %r10, %r9 ; SRC-NEXT: cmovneq %rdx, %rcx -; SRC-NEXT: xorl %r9d, %r9d +; SRC-NEXT: xorl %edi, %edi ; SRC-NEXT: jmp .LBB4_3 ; SRC-NEXT: .LBB4_1: ; SRC-NEXT: movl $256, %ecx # imm = 0x100 ; SRC-NEXT: .LBB4_3: # %cond.end ; SRC-NEXT: movq %rcx, (%rax) -; SRC-NEXT: movq %r9, 8(%rax) -; SRC-NEXT: movq %r9, 16(%rax) -; SRC-NEXT: movq %r9, 24(%rax) +; SRC-NEXT: movq %rdi, 8(%rax) +; SRC-NEXT: movq %rdi, 16(%rax) +; SRC-NEXT: movq %rdi, 24(%rax) ; SRC-NEXT: popq %rbx ; SRC-NEXT: retq ; @@ -967,13 +967,13 @@ ; LIN-NEXT: movq %rdi, %rax ; LIN-NEXT: movq %rsi, %rbx ; LIN-NEXT: negq %rbx -; LIN-NEXT: xorl %r9d, %r9d -; LIN-NEXT: movl $0, %edi -; LIN-NEXT: sbbq %rdx, %rdi -; LIN-NEXT: movl $0, %r10d -; LIN-NEXT: sbbq %rcx, %r10 +; LIN-NEXT: xorl %edi, %edi ; LIN-NEXT: movl $0, %r11d -; LIN-NEXT: sbbq %r8, %r11 +; LIN-NEXT: sbbq %rdx, %r11 +; LIN-NEXT: movl $0, %r9d +; LIN-NEXT: sbbq %rcx, %r9 +; LIN-NEXT: movl $0, %r10d +; LIN-NEXT: sbbq %r8, %r10 ; LIN-NEXT: orq %rcx, %rsi ; LIN-NEXT: orq %r8, %rdx ; LIN-NEXT: orq %rsi, %rdx @@ -982,29 +982,29 @@ ; LIN-NEXT: bsrq %rbx, %rcx ; LIN-NEXT: xorq $63, %rcx ; LIN-NEXT: addq $64, %rcx -; LIN-NEXT: bsrq %rdi, %rdx +; LIN-NEXT: bsrq %r11, %rdx ; LIN-NEXT: xorq $63, %rdx -; LIN-NEXT: testq %rdi, %rdi +; LIN-NEXT: testq %r11, %r11 ; LIN-NEXT: cmoveq %rcx, %rdx ; LIN-NEXT: subq $-128, %rdx -; LIN-NEXT: bsrq %r10, %rsi +; LIN-NEXT: bsrq %r9, %rsi ; LIN-NEXT: xorq $63, %rsi ; LIN-NEXT: addq $64, %rsi -; LIN-NEXT: bsrq %r11, %rcx +; LIN-NEXT: bsrq %r10, %rcx ; LIN-NEXT: xorq $63, %rcx -; LIN-NEXT: testq %r11, %r11 +; LIN-NEXT: testq %r10, %r10 ; LIN-NEXT: cmoveq %rsi, %rcx -; LIN-NEXT: orq %r11, %r10 +; LIN-NEXT: orq %r10, %r9 ; LIN-NEXT: cmoveq %rdx, %rcx -; LIN-NEXT: xorl %r9d, %r9d +; LIN-NEXT: xorl %edi, %edi ; LIN-NEXT: jmp .LBB4_3 ; LIN-NEXT: .LBB4_1: ; LIN-NEXT: movl $256, %ecx # imm = 0x100 ; LIN-NEXT: .LBB4_3: # %cond.end ; LIN-NEXT: movq %rcx, (%rax) -; LIN-NEXT: movq %r9, 8(%rax) -; LIN-NEXT: movq %r9, 16(%rax) -; LIN-NEXT: movq %r9, 
24(%rax) +; LIN-NEXT: movq %rdi, 8(%rax) +; LIN-NEXT: movq %rdi, 16(%rax) +; LIN-NEXT: movq %rdi, 24(%rax) ; LIN-NEXT: popq %rbx ; LIN-NEXT: retq %b = sub i256 0, %a diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -194,7 +194,6 @@ define i4 @func4(i4 %x, i4 %y) nounwind { ; X64-LABEL: func4: ; X64: # %bb.0: -; X64-NEXT: pushq %rbx ; X64-NEXT: shlb $4, %sil ; X64-NEXT: sarb $4, %sil ; X64-NEXT: shlb $4, %dil @@ -203,21 +202,20 @@ ; X64-NEXT: movsbl %dil, %ecx ; X64-NEXT: movl %ecx, %eax ; X64-NEXT: idivb %sil -; X64-NEXT: movsbl %ah, %ebx +; X64-NEXT: movsbl %ah, %edx ; X64-NEXT: movzbl %al, %edi ; X64-NEXT: leal -1(%rdi), %eax ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: testb %sil, %sil -; X64-NEXT: sets %dl +; X64-NEXT: sets %sil ; X64-NEXT: testb %cl, %cl ; X64-NEXT: sets %cl -; X64-NEXT: xorb %dl, %cl -; X64-NEXT: testb %bl, %bl +; X64-NEXT: xorb %sil, %cl +; X64-NEXT: testb %dl, %dl ; X64-NEXT: setne %dl ; X64-NEXT: testb %cl, %dl ; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: popq %rbx ; X64-NEXT: retq ; ; X86-LABEL: func4: @@ -264,29 +262,29 @@ ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx ; X64-NEXT: pushq %rax -; X64-NEXT: movq %rsi, %r14 +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdi, %r14 ; X64-NEXT: movq %rdi, %r15 -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: sarq $63, %rbx -; X64-NEXT: shldq $31, %rdi, %rbx -; X64-NEXT: shlq $31, %r15 +; X64-NEXT: sarq $63, %r15 +; X64-NEXT: shldq $31, %rdi, %r15 +; X64-NEXT: shlq $31, %r14 ; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: sarq $63, %r12 -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %rbx, %rsi -; X64-NEXT: movq %r14, %rdx +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %r15, %rsi +; X64-NEXT: movq %rbx, %rdx ; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill ; X64-NEXT: leaq -1(%rax), %rbp -; X64-NEXT: testq %rbx, %rbx +; X64-NEXT: testq %r15, %r15 ; X64-NEXT: sets %al ; X64-NEXT: testq %r12, %r12 ; X64-NEXT: sets %r13b ; X64-NEXT: xorb %al, %r13b -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %rbx, %rsi -; X64-NEXT: movq %r14, %rdx +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %r15, %rsi +; X64-NEXT: movq %rbx, %rdx ; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx @@ -456,37 +454,37 @@ ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %rcx -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; X64-NEXT: movq %xmm3, %rcx +; X64-NEXT: movq %xmm3, %rdi ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm3, %rax ; X64-NEXT: cqto -; X64-NEXT: idivq %rcx -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: idivq %rdi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: pxor %xmm3, %xmm3 ; X64-NEXT: pcmpgtd %xmm4, %xmm3 ; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; X64-NEXT: movq %xmm4, %rdi +; X64-NEXT: movq %xmm4, %r9 ; X64-NEXT: pxor %xmm5, %xmm5 ; X64-NEXT: pcmpgtd %xmm1, %xmm5 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; X64-NEXT: psllq $31, %xmm1 ; X64-NEXT: movq %xmm1, %rax ; X64-NEXT: cqto -; X64-NEXT: idivq %rdi +; X64-NEXT: idivq %r9 ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: pshufd {{.*#+}} xmm4 = 
xmm4[2,3,2,3] -; X64-NEXT: movq %xmm4, %rsi +; X64-NEXT: movq %xmm4, %r11 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; X64-NEXT: movq %xmm4, %rax ; X64-NEXT: cqto -; X64-NEXT: idivq %rsi -; X64-NEXT: movq %r11, %xmm4 -; X64-NEXT: movq %rcx, %xmm5 +; X64-NEXT: idivq %r11 +; X64-NEXT: movq %rsi, %xmm4 +; X64-NEXT: movq %r8, %xmm5 ; X64-NEXT: pxor %xmm6, %xmm6 ; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; X64-NEXT: pcmpeqd %xmm6, %xmm4 @@ -498,9 +496,9 @@ ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-NEXT: movq %r8, %xmm0 +; X64-NEXT: movq %rcx, %xmm0 ; X64-NEXT: pxor %xmm4, %xmm2 -; X64-NEXT: movq %r10, %xmm4 +; X64-NEXT: movq %rdi, %xmm4 ; X64-NEXT: pandn %xmm2, %xmm5 ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; X64-NEXT: movdqa %xmm5, %xmm2 @@ -509,7 +507,7 @@ ; X64-NEXT: paddq %xmm4, %xmm0 ; X64-NEXT: pand %xmm5, %xmm0 ; X64-NEXT: por %xmm2, %xmm0 -; X64-NEXT: movq %rdi, %xmm2 +; X64-NEXT: movq %r10, %xmm2 ; X64-NEXT: movq %rdx, %xmm5 ; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; X64-NEXT: pcmpeqd %xmm6, %xmm2 diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -228,7 +228,6 @@ ; ; X64-LABEL: func4: ; X64: # %bb.0: -; X64-NEXT: pushq %rbx ; X64-NEXT: shlb $4, %sil ; X64-NEXT: sarb $4, %sil ; X64-NEXT: shlb $4, %dil @@ -237,16 +236,16 @@ ; X64-NEXT: movsbl %dil, %ecx ; X64-NEXT: movl %ecx, %eax ; X64-NEXT: idivb %sil -; X64-NEXT: movsbl %ah, %ebx +; X64-NEXT: movsbl %ah, %edx ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax), %edi ; X64-NEXT: movzbl %dil, %edi ; X64-NEXT: testb %sil, %sil -; X64-NEXT: sets %dl +; X64-NEXT: sets %sil ; X64-NEXT: testb %cl, %cl ; X64-NEXT: sets %cl -; X64-NEXT: xorb %dl, %cl -; X64-NEXT: testb %bl, %bl +; X64-NEXT: xorb %sil, %cl +; X64-NEXT: testb %dl, %dl ; X64-NEXT: setne %dl ; X64-NEXT: testb %cl, %dl ; X64-NEXT: cmovel %eax, %edi @@ -257,7 +256,6 @@ ; X64-NEXT: movl $248, %eax ; X64-NEXT: cmovgel %ecx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: popq %rbx ; X64-NEXT: retq ; ; X86-LABEL: func4: @@ -590,10 +588,10 @@ ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %r15 -; X64-NEXT: movq %r15, %rbp +; X64-NEXT: movq %xmm0, %rbx +; X64-NEXT: movq %rbx, %rbp ; X64-NEXT: sarq $63, %rbp -; X64-NEXT: shldq $31, %r15, %rbp +; X64-NEXT: shldq $31, %rbx, %rbp ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 @@ -601,13 +599,13 @@ ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: sarq $63, %rbx -; X64-NEXT: movq %r15, %r12 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: sarq $63, %r15 +; X64-NEXT: movq %rbx, %r12 ; X64-NEXT: shlq $31, %r12 ; X64-NEXT: movq %r12, %rdi ; X64-NEXT: movq %rbp, %rsi -; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: movq %r15, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -615,16 +613,16 @@ ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r13 ; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %r15 
-; X64-NEXT: xorl %ebx, %r15d +; X64-NEXT: shrq $63, %rbx +; X64-NEXT: xorl %r15d, %ebx ; X64-NEXT: movq %r12, %rdi ; X64-NEXT: movq %rbp, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: movq %r15, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al -; X64-NEXT: testb %r15b, %al +; X64-NEXT: testb %bl, %al ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF @@ -649,57 +647,57 @@ ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movq %rbx, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: shldq $31, %rbx, %rbp ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: movq %rbx, %r15 -; X64-NEXT: shlq $31, %r15 -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: sarq $63, %r15 +; X64-NEXT: movq %rbx, %r12 +; X64-NEXT: shlq $31, %r12 +; X64-NEXT: movq %r12, %rdi +; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: movq %r15, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %r12 +; X64-NEXT: subq $1, %r13 ; X64-NEXT: sbbq $0, %r14 ; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %ebp, %ebx -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: xorl %r15d, %ebx +; X64-NEXT: movq %r12, %rdi +; X64-NEXT: movq %rbp, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %r15, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: testb %bl, %al ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %r12 +; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF -; X64-NEXT: cmovbq %r12, %rax +; X64-NEXT: cmovbq %r13, %rax ; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovnsq %rcx, %r12 -; X64-NEXT: cmoveq %rax, %r12 +; X64-NEXT: cmovnsq %rcx, %r13 +; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movl $0, %eax ; X64-NEXT: cmovnsq %rax, %r14 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rcx, %r12 +; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: cmovaq %r12, %rax +; X64-NEXT: cmovaq %r13, %rax ; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovsq %rcx, %r12 +; X64-NEXT: cmovsq %rcx, %r13 ; X64-NEXT: cmpq $-1, %r14 -; X64-NEXT: cmoveq %rax, %r12 -; X64-NEXT: movq %r12, %xmm0 +; X64-NEXT: cmoveq %rax, %r13 +; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 
; X64-NEXT: psrlq $1, %xmm1 @@ -715,9 +713,9 @@ ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movq %rbx, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: shldq $31, %rbx, %rbp ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; X64-NEXT: pxor %xmm1, %xmm1 ; X64-NEXT: pcmpgtd %xmm0, %xmm1 @@ -725,105 +723,105 @@ ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: movq %rbx, %r15 -; X64-NEXT: shlq $31, %r15 -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: sarq $63, %r15 +; X64-NEXT: movq %rbx, %r12 +; X64-NEXT: shlq $31, %r12 +; X64-NEXT: movq %r12, %rdi +; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: movq %r15, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %r12 +; X64-NEXT: subq $1, %r13 ; X64-NEXT: sbbq $0, %r14 ; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %ebp, %ebx -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: xorl %r15d, %ebx +; X64-NEXT: movq %r12, %rdi +; X64-NEXT: movq %rbp, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %r15, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: testb %bl, %al ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %r12 +; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF -; X64-NEXT: cmovbq %r12, %rax +; X64-NEXT: cmovbq %r13, %rax ; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovnsq %rcx, %r12 -; X64-NEXT: cmoveq %rax, %r12 +; X64-NEXT: cmovnsq %rcx, %r13 +; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movl $0, %eax ; X64-NEXT: cmovnsq %rax, %r14 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rcx, %r12 +; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: cmovaq %r12, %rax +; X64-NEXT: cmovaq %r13, %rax ; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovsq %rcx, %r12 +; X64-NEXT: cmovsq %rcx, %r13 ; X64-NEXT: cmpq $-1, %r14 -; X64-NEXT: cmoveq %rax, %r12 -; X64-NEXT: movq %r12, %xmm0 +; X64-NEXT: cmoveq %rax, %r13 +; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movq %rbx, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: shldq $31, %rbx, %rbp ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: movq %rbx, %r15 -; X64-NEXT: shlq $31, %r15 -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: sarq $63, %r15 +; X64-NEXT: movq %rbx, %r12 +; X64-NEXT: shlq $31, %r12 +; X64-NEXT: movq %r12, %rdi +; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: movq %r15, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %r12 +; X64-NEXT: subq $1, %r13 ; X64-NEXT: sbbq $0, %r14 ; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %ebp, %ebx -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: xorl %r15d, %ebx +; X64-NEXT: movq %r12, %rdi +; X64-NEXT: movq %rbp, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %r15, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: testb %bl, %al ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %r12 +; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF -; X64-NEXT: cmovbq %r12, %rax +; X64-NEXT: cmovbq %r13, %rax ; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovnsq %rcx, %r12 -; X64-NEXT: cmoveq %rax, %r12 +; X64-NEXT: cmovnsq %rcx, %r13 +; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movl $0, %eax ; X64-NEXT: cmovnsq %rax, %r14 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rcx, %r12 +; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: cmovaq %r12, %rax +; X64-NEXT: cmovaq %r13, %rax ; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovsq %rcx, %r12 +; X64-NEXT: cmovsq %rcx, %r13 ; X64-NEXT: cmpq $-1, %r14 -; X64-NEXT: cmoveq %rax, %r12 -; X64-NEXT: movq %r12, %xmm1 +; X64-NEXT: cmoveq %rax, %r13 +; X64-NEXT: movq %r13, %xmm1 ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: psrlq $1, %xmm0 diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -80,21 +80,21 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm4, %rcx ; SSE2-NEXT: movq %xmm0, %rdx -; SSE2-NEXT: movq %xmm1, %r8 +; SSE2-NEXT: movq %xmm1, %rsi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdi ; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rsi -; SSE2-NEXT: xorq %rcx, %rsi -; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm0, %r8 +; SSE2-NEXT: xorq %rcx, %r8 +; SSE2-NEXT: orq %rdi, %r8 ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: xorq %rdx, %rax ; SSE2-NEXT: movq %xmm3, %rcx -; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: xorq %rsi, %rcx ; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rsi, %rcx +; SSE2-NEXT: orq %r8, %rcx ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -103,19 +103,19 @@ ; SSE41-NEXT: movq %xmm0, %rax ; 
SSE41-NEXT: movq %xmm1, %rcx ; SSE41-NEXT: pextrq $1, %xmm0, %rdx -; SSE41-NEXT: pextrq $1, %xmm1, %r8 +; SSE41-NEXT: pextrq $1, %xmm1, %rsi ; SSE41-NEXT: movq %xmm2, %rdi ; SSE41-NEXT: xorq %rax, %rdi -; SSE41-NEXT: movq %xmm3, %rsi -; SSE41-NEXT: xorq %rcx, %rsi -; SSE41-NEXT: orq %rdi, %rsi +; SSE41-NEXT: movq %xmm3, %r8 +; SSE41-NEXT: xorq %rcx, %r8 +; SSE41-NEXT: orq %rdi, %r8 ; SSE41-NEXT: pextrq $1, %xmm2, %rax ; SSE41-NEXT: xorq %rdx, %rax ; SSE41-NEXT: pextrq $1, %xmm3, %rcx -; SSE41-NEXT: xorq %r8, %rcx +; SSE41-NEXT: xorq %rsi, %rcx ; SSE41-NEXT: orq %rax, %rcx ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rsi, %rcx +; SSE41-NEXT: orq %r8, %rcx ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -160,21 +160,21 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm4, %rcx ; SSE2-NEXT: movq %xmm0, %rdx -; SSE2-NEXT: movq %xmm1, %r8 +; SSE2-NEXT: movq %xmm1, %rsi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdi ; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rsi -; SSE2-NEXT: xorq %rcx, %rsi -; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm0, %r8 +; SSE2-NEXT: xorq %rcx, %r8 +; SSE2-NEXT: orq %rdi, %r8 ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: xorq %rdx, %rax ; SSE2-NEXT: movq %xmm3, %rcx -; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: xorq %rsi, %rcx ; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rsi, %rcx +; SSE2-NEXT: orq %r8, %rcx ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -183,19 +183,19 @@ ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: movq %xmm1, %rcx ; SSE41-NEXT: pextrq $1, %xmm0, %rdx -; SSE41-NEXT: pextrq $1, %xmm1, %r8 +; SSE41-NEXT: pextrq $1, %xmm1, %rsi ; SSE41-NEXT: movq %xmm2, %rdi ; SSE41-NEXT: xorq %rax, %rdi -; SSE41-NEXT: movq %xmm3, %rsi -; SSE41-NEXT: xorq %rcx, %rsi -; SSE41-NEXT: orq %rdi, %rsi +; SSE41-NEXT: movq %xmm3, %r8 +; SSE41-NEXT: xorq %rcx, %r8 +; SSE41-NEXT: orq %rdi, %r8 ; SSE41-NEXT: pextrq $1, %xmm2, %rax ; SSE41-NEXT: xorq %rdx, %rax ; SSE41-NEXT: pextrq $1, %xmm3, %rcx -; SSE41-NEXT: xorq %r8, %rcx +; SSE41-NEXT: xorq %rsi, %rcx ; SSE41-NEXT: orq %rax, %rcx ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rsi, %rcx +; SSE41-NEXT: orq %r8, %rcx ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -242,14 +242,14 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rax -; SSE2-NEXT: movq %xmm0, %r11 +; SSE2-NEXT: movq %xmm8, %r8 +; SSE2-NEXT: movq %xmm0, %r9 ; SSE2-NEXT: movq %xmm2, %r10 -; SSE2-NEXT: movq %xmm1, %r9 -; SSE2-NEXT: movq %xmm3, %r8 +; SSE2-NEXT: movq %xmm1, %rcx +; SSE2-NEXT: movq %xmm3, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: xorq %rdx, %rcx +; SSE2-NEXT: movq %xmm0, %r11 +; SSE2-NEXT: xorq %rdx, %r11 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: xorq %rsi, %rdx @@ -258,23 +258,23 @@ ; SSE2-NEXT: xorq %rdi, %rsi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdi -; SSE2-NEXT: xorq %rax, %rdi +; SSE2-NEXT: xorq %r8, %rdi ; SSE2-NEXT: orq %rsi, %rdi ; SSE2-NEXT: orq %rdx, %rdi -; SSE2-NEXT: orq %rcx, %rdi -; SSE2-NEXT: movq %xmm4, %rax -; SSE2-NEXT: xorq %r11, %rax -; SSE2-NEXT: movq %xmm6, %rcx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: movq %xmm5, %rdx +; SSE2-NEXT: orq %r11, %rdi +; SSE2-NEXT: movq %xmm4, %rdx ; SSE2-NEXT: xorq %r9, %rdx -; SSE2-NEXT: movq 
%xmm7, %rsi -; SSE2-NEXT: xorq %r8, %rsi -; SSE2-NEXT: orq %rdx, %rsi -; SSE2-NEXT: orq %rcx, %rsi -; SSE2-NEXT: orq %rax, %rsi +; SSE2-NEXT: movq %xmm6, %rsi +; SSE2-NEXT: xorq %r10, %rsi +; SSE2-NEXT: movq %xmm5, %r8 +; SSE2-NEXT: xorq %rcx, %r8 +; SSE2-NEXT: movq %xmm7, %rcx +; SSE2-NEXT: xorq %rax, %rcx +; SSE2-NEXT: orq %r8, %rcx +; SSE2-NEXT: orq %rsi, %rcx +; SSE2-NEXT: orq %rdx, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: orq %rdi, %rcx ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -284,12 +284,12 @@ ; SSE41-NEXT: movq %xmm2, %rcx ; SSE41-NEXT: movq %xmm1, %rdx ; SSE41-NEXT: movq %xmm3, %rsi -; SSE41-NEXT: pextrq $1, %xmm0, %r11 +; SSE41-NEXT: pextrq $1, %xmm0, %rdi ; SSE41-NEXT: pextrq $1, %xmm2, %r8 ; SSE41-NEXT: pextrq $1, %xmm1, %r9 ; SSE41-NEXT: pextrq $1, %xmm3, %r10 -; SSE41-NEXT: movq %xmm4, %rdi -; SSE41-NEXT: xorq %rax, %rdi +; SSE41-NEXT: movq %xmm4, %r11 +; SSE41-NEXT: xorq %rax, %r11 ; SSE41-NEXT: movq %xmm6, %rax ; SSE41-NEXT: xorq %rcx, %rax ; SSE41-NEXT: movq %xmm5, %rcx @@ -298,9 +298,9 @@ ; SSE41-NEXT: xorq %rsi, %rdx ; SSE41-NEXT: orq %rcx, %rdx ; SSE41-NEXT: orq %rax, %rdx -; SSE41-NEXT: orq %rdi, %rdx +; SSE41-NEXT: orq %r11, %rdx ; SSE41-NEXT: pextrq $1, %xmm4, %rax -; SSE41-NEXT: xorq %r11, %rax +; SSE41-NEXT: xorq %rdi, %rax ; SSE41-NEXT: pextrq $1, %xmm6, %rcx ; SSE41-NEXT: xorq %r8, %rcx ; SSE41-NEXT: pextrq $1, %xmm5, %rsi @@ -322,13 +322,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vmovq %xmm4, %rdi ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vmovq %xmm5, %rax -; AVX1-NEXT: vpextrq $1, %xmm0, %r11 +; AVX1-NEXT: vmovq %xmm5, %r8 +; AVX1-NEXT: vpextrq $1, %xmm0, %r9 ; AVX1-NEXT: vpextrq $1, %xmm1, %r10 -; AVX1-NEXT: vpextrq $1, %xmm4, %r9 -; AVX1-NEXT: vpextrq $1, %xmm5, %r8 -; AVX1-NEXT: vmovq %xmm2, %rcx -; AVX1-NEXT: xorq %rdx, %rcx +; AVX1-NEXT: vpextrq $1, %xmm4, %rcx +; AVX1-NEXT: vpextrq $1, %xmm5, %rax +; AVX1-NEXT: vmovq %xmm2, %r11 +; AVX1-NEXT: xorq %rdx, %r11 ; AVX1-NEXT: vmovq %xmm3, %rdx ; AVX1-NEXT: xorq %rsi, %rdx ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 @@ -336,23 +336,23 @@ ; AVX1-NEXT: xorq %rdi, %rsi ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rdi -; AVX1-NEXT: xorq %rax, %rdi +; AVX1-NEXT: xorq %r8, %rdi ; AVX1-NEXT: orq %rsi, %rdi ; AVX1-NEXT: orq %rdx, %rdi -; AVX1-NEXT: orq %rcx, %rdi -; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: xorq %r11, %rax -; AVX1-NEXT: vpextrq $1, %xmm3, %rcx -; AVX1-NEXT: xorq %r10, %rcx -; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: orq %r11, %rdi +; AVX1-NEXT: vpextrq $1, %xmm2, %rdx ; AVX1-NEXT: xorq %r9, %rdx -; AVX1-NEXT: vpextrq $1, %xmm1, %rsi -; AVX1-NEXT: xorq %r8, %rsi -; AVX1-NEXT: orq %rdx, %rsi -; AVX1-NEXT: orq %rcx, %rsi -; AVX1-NEXT: orq %rax, %rsi +; AVX1-NEXT: vpextrq $1, %xmm3, %rsi +; AVX1-NEXT: xorq %r10, %rsi +; AVX1-NEXT: vpextrq $1, %xmm0, %r8 +; AVX1-NEXT: xorq %rcx, %r8 +; AVX1-NEXT: vpextrq $1, %xmm1, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: orq %r8, %rcx +; AVX1-NEXT: orq %rsi, %rcx +; AVX1-NEXT: orq %rdx, %rcx ; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: orq %rdi, %rsi +; AVX1-NEXT: orq %rdi, %rcx ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -364,13 +364,13 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vmovq %xmm4, %rdi ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vmovq %xmm5, %rax -; AVX2-NEXT: vpextrq $1, %xmm0, %r11 +; AVX2-NEXT: vmovq %xmm5, %r8 +; AVX2-NEXT: vpextrq $1, %xmm0, %r9 ; AVX2-NEXT: vpextrq $1, %xmm1, %r10 -; 
AVX2-NEXT: vpextrq $1, %xmm4, %r9 -; AVX2-NEXT: vpextrq $1, %xmm5, %r8 -; AVX2-NEXT: vmovq %xmm2, %rcx -; AVX2-NEXT: xorq %rdx, %rcx +; AVX2-NEXT: vpextrq $1, %xmm4, %rcx +; AVX2-NEXT: vpextrq $1, %xmm5, %rax +; AVX2-NEXT: vmovq %xmm2, %r11 +; AVX2-NEXT: xorq %rdx, %r11 ; AVX2-NEXT: vmovq %xmm3, %rdx ; AVX2-NEXT: xorq %rsi, %rdx ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 @@ -378,23 +378,23 @@ ; AVX2-NEXT: xorq %rdi, %rsi ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rdi -; AVX2-NEXT: xorq %rax, %rdi +; AVX2-NEXT: xorq %r8, %rdi ; AVX2-NEXT: orq %rsi, %rdi ; AVX2-NEXT: orq %rdx, %rdi -; AVX2-NEXT: orq %rcx, %rdi -; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: xorq %r11, %rax -; AVX2-NEXT: vpextrq $1, %xmm3, %rcx -; AVX2-NEXT: xorq %r10, %rcx -; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: orq %r11, %rdi +; AVX2-NEXT: vpextrq $1, %xmm2, %rdx ; AVX2-NEXT: xorq %r9, %rdx -; AVX2-NEXT: vpextrq $1, %xmm1, %rsi -; AVX2-NEXT: xorq %r8, %rsi -; AVX2-NEXT: orq %rdx, %rsi -; AVX2-NEXT: orq %rcx, %rsi -; AVX2-NEXT: orq %rax, %rsi +; AVX2-NEXT: vpextrq $1, %xmm3, %rsi +; AVX2-NEXT: xorq %r10, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %r8 +; AVX2-NEXT: xorq %rcx, %r8 +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: xorq %rax, %rcx +; AVX2-NEXT: orq %r8, %rcx +; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: orq %rdx, %rcx ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rdi, %rsi +; AVX2-NEXT: orq %rdi, %rcx ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -433,14 +433,14 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm8, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rax -; SSE2-NEXT: movq %xmm0, %r11 +; SSE2-NEXT: movq %xmm8, %r8 +; SSE2-NEXT: movq %xmm0, %r9 ; SSE2-NEXT: movq %xmm2, %r10 -; SSE2-NEXT: movq %xmm1, %r9 -; SSE2-NEXT: movq %xmm3, %r8 +; SSE2-NEXT: movq %xmm1, %rcx +; SSE2-NEXT: movq %xmm3, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: xorq %rdx, %rcx +; SSE2-NEXT: movq %xmm0, %r11 +; SSE2-NEXT: xorq %rdx, %r11 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: xorq %rsi, %rdx @@ -449,23 +449,23 @@ ; SSE2-NEXT: xorq %rdi, %rsi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdi -; SSE2-NEXT: xorq %rax, %rdi +; SSE2-NEXT: xorq %r8, %rdi ; SSE2-NEXT: orq %rsi, %rdi ; SSE2-NEXT: orq %rdx, %rdi -; SSE2-NEXT: orq %rcx, %rdi -; SSE2-NEXT: movq %xmm4, %rax -; SSE2-NEXT: xorq %r11, %rax -; SSE2-NEXT: movq %xmm6, %rcx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: movq %xmm5, %rdx +; SSE2-NEXT: orq %r11, %rdi +; SSE2-NEXT: movq %xmm4, %rdx ; SSE2-NEXT: xorq %r9, %rdx -; SSE2-NEXT: movq %xmm7, %rsi -; SSE2-NEXT: xorq %r8, %rsi -; SSE2-NEXT: orq %rdx, %rsi -; SSE2-NEXT: orq %rcx, %rsi -; SSE2-NEXT: orq %rax, %rsi +; SSE2-NEXT: movq %xmm6, %rsi +; SSE2-NEXT: xorq %r10, %rsi +; SSE2-NEXT: movq %xmm5, %r8 +; SSE2-NEXT: xorq %rcx, %r8 +; SSE2-NEXT: movq %xmm7, %rcx +; SSE2-NEXT: xorq %rax, %rcx +; SSE2-NEXT: orq %r8, %rcx +; SSE2-NEXT: orq %rsi, %rcx +; SSE2-NEXT: orq %rdx, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: orq %rdi, %rcx ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -475,12 +475,12 @@ ; SSE41-NEXT: movq %xmm2, %rcx ; SSE41-NEXT: movq %xmm1, %rdx ; SSE41-NEXT: movq %xmm3, %rsi -; SSE41-NEXT: pextrq $1, %xmm0, %r11 +; SSE41-NEXT: pextrq $1, %xmm0, %rdi ; SSE41-NEXT: pextrq $1, %xmm2, %r8 ; SSE41-NEXT: pextrq $1, %xmm1, %r9 ; SSE41-NEXT: 
pextrq $1, %xmm3, %r10 -; SSE41-NEXT: movq %xmm4, %rdi -; SSE41-NEXT: xorq %rax, %rdi +; SSE41-NEXT: movq %xmm4, %r11 +; SSE41-NEXT: xorq %rax, %r11 ; SSE41-NEXT: movq %xmm6, %rax ; SSE41-NEXT: xorq %rcx, %rax ; SSE41-NEXT: movq %xmm5, %rcx @@ -489,9 +489,9 @@ ; SSE41-NEXT: xorq %rsi, %rdx ; SSE41-NEXT: orq %rcx, %rdx ; SSE41-NEXT: orq %rax, %rdx -; SSE41-NEXT: orq %rdi, %rdx +; SSE41-NEXT: orq %r11, %rdx ; SSE41-NEXT: pextrq $1, %xmm4, %rax -; SSE41-NEXT: xorq %r11, %rax +; SSE41-NEXT: xorq %rdi, %rax ; SSE41-NEXT: pextrq $1, %xmm6, %rcx ; SSE41-NEXT: xorq %r8, %rcx ; SSE41-NEXT: pextrq $1, %xmm5, %rsi @@ -513,13 +513,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vmovq %xmm4, %rdi ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vmovq %xmm5, %rax -; AVX1-NEXT: vpextrq $1, %xmm0, %r11 +; AVX1-NEXT: vmovq %xmm5, %r8 +; AVX1-NEXT: vpextrq $1, %xmm0, %r9 ; AVX1-NEXT: vpextrq $1, %xmm1, %r10 -; AVX1-NEXT: vpextrq $1, %xmm4, %r9 -; AVX1-NEXT: vpextrq $1, %xmm5, %r8 -; AVX1-NEXT: vmovq %xmm2, %rcx -; AVX1-NEXT: xorq %rdx, %rcx +; AVX1-NEXT: vpextrq $1, %xmm4, %rcx +; AVX1-NEXT: vpextrq $1, %xmm5, %rax +; AVX1-NEXT: vmovq %xmm2, %r11 +; AVX1-NEXT: xorq %rdx, %r11 ; AVX1-NEXT: vmovq %xmm3, %rdx ; AVX1-NEXT: xorq %rsi, %rdx ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 @@ -527,23 +527,23 @@ ; AVX1-NEXT: xorq %rdi, %rsi ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rdi -; AVX1-NEXT: xorq %rax, %rdi +; AVX1-NEXT: xorq %r8, %rdi ; AVX1-NEXT: orq %rsi, %rdi ; AVX1-NEXT: orq %rdx, %rdi -; AVX1-NEXT: orq %rcx, %rdi -; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: xorq %r11, %rax -; AVX1-NEXT: vpextrq $1, %xmm3, %rcx -; AVX1-NEXT: xorq %r10, %rcx -; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: orq %r11, %rdi +; AVX1-NEXT: vpextrq $1, %xmm2, %rdx ; AVX1-NEXT: xorq %r9, %rdx -; AVX1-NEXT: vpextrq $1, %xmm1, %rsi -; AVX1-NEXT: xorq %r8, %rsi -; AVX1-NEXT: orq %rdx, %rsi -; AVX1-NEXT: orq %rcx, %rsi -; AVX1-NEXT: orq %rax, %rsi +; AVX1-NEXT: vpextrq $1, %xmm3, %rsi +; AVX1-NEXT: xorq %r10, %rsi +; AVX1-NEXT: vpextrq $1, %xmm0, %r8 +; AVX1-NEXT: xorq %rcx, %r8 +; AVX1-NEXT: vpextrq $1, %xmm1, %rcx +; AVX1-NEXT: xorq %rax, %rcx +; AVX1-NEXT: orq %r8, %rcx +; AVX1-NEXT: orq %rsi, %rcx +; AVX1-NEXT: orq %rdx, %rcx ; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: orq %rdi, %rsi +; AVX1-NEXT: orq %rdi, %rcx ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -555,13 +555,13 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vmovq %xmm4, %rdi ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vmovq %xmm5, %rax -; AVX2-NEXT: vpextrq $1, %xmm0, %r11 +; AVX2-NEXT: vmovq %xmm5, %r8 +; AVX2-NEXT: vpextrq $1, %xmm0, %r9 ; AVX2-NEXT: vpextrq $1, %xmm1, %r10 -; AVX2-NEXT: vpextrq $1, %xmm4, %r9 -; AVX2-NEXT: vpextrq $1, %xmm5, %r8 -; AVX2-NEXT: vmovq %xmm2, %rcx -; AVX2-NEXT: xorq %rdx, %rcx +; AVX2-NEXT: vpextrq $1, %xmm4, %rcx +; AVX2-NEXT: vpextrq $1, %xmm5, %rax +; AVX2-NEXT: vmovq %xmm2, %r11 +; AVX2-NEXT: xorq %rdx, %r11 ; AVX2-NEXT: vmovq %xmm3, %rdx ; AVX2-NEXT: xorq %rsi, %rdx ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 @@ -569,23 +569,23 @@ ; AVX2-NEXT: xorq %rdi, %rsi ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rdi -; AVX2-NEXT: xorq %rax, %rdi +; AVX2-NEXT: xorq %r8, %rdi ; AVX2-NEXT: orq %rsi, %rdi ; AVX2-NEXT: orq %rdx, %rdi -; AVX2-NEXT: orq %rcx, %rdi -; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: xorq %r11, %rax -; AVX2-NEXT: vpextrq $1, %xmm3, %rcx -; AVX2-NEXT: xorq %r10, %rcx -; AVX2-NEXT: vpextrq $1, 
%xmm0, %rdx +; AVX2-NEXT: orq %r11, %rdi +; AVX2-NEXT: vpextrq $1, %xmm2, %rdx ; AVX2-NEXT: xorq %r9, %rdx -; AVX2-NEXT: vpextrq $1, %xmm1, %rsi -; AVX2-NEXT: xorq %r8, %rsi -; AVX2-NEXT: orq %rdx, %rsi -; AVX2-NEXT: orq %rcx, %rsi -; AVX2-NEXT: orq %rax, %rsi +; AVX2-NEXT: vpextrq $1, %xmm3, %rsi +; AVX2-NEXT: xorq %r10, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %r8 +; AVX2-NEXT: xorq %rcx, %r8 +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: xorq %rax, %rcx +; AVX2-NEXT: orq %r8, %rcx +; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: orq %rdx, %rcx ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rdi, %rsi +; AVX2-NEXT: orq %rdi, %rcx ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -736,59 +736,59 @@ define i32 @ne_i256_pair(ptr %a, ptr %b) { ; SSE2-LABEL: ne_i256_pair: ; SSE2: # %bb.0: -; SSE2-NEXT: movq 16(%rdi), %r9 -; SSE2-NEXT: movq 24(%rdi), %r11 -; SSE2-NEXT: movq (%rdi), %r8 -; SSE2-NEXT: movq 8(%rdi), %r10 -; SSE2-NEXT: xorq 8(%rsi), %r10 -; SSE2-NEXT: xorq 24(%rsi), %r11 -; SSE2-NEXT: xorq (%rsi), %r8 -; SSE2-NEXT: xorq 16(%rsi), %r9 -; SSE2-NEXT: movq 48(%rdi), %rcx -; SSE2-NEXT: movq 32(%rdi), %rax -; SSE2-NEXT: movq 56(%rdi), %rdx +; SSE2-NEXT: movq 16(%rdi), %rcx +; SSE2-NEXT: movq 24(%rdi), %rdx +; SSE2-NEXT: movq (%rdi), %rax +; SSE2-NEXT: movq 8(%rdi), %r8 +; SSE2-NEXT: xorq 8(%rsi), %r8 +; SSE2-NEXT: xorq 24(%rsi), %rdx +; SSE2-NEXT: xorq (%rsi), %rax +; SSE2-NEXT: xorq 16(%rsi), %rcx +; SSE2-NEXT: movq 48(%rdi), %r9 +; SSE2-NEXT: movq 32(%rdi), %r10 +; SSE2-NEXT: movq 56(%rdi), %r11 ; SSE2-NEXT: movq 40(%rdi), %rdi ; SSE2-NEXT: xorq 40(%rsi), %rdi -; SSE2-NEXT: xorq 56(%rsi), %rdx -; SSE2-NEXT: orq %r11, %rdx -; SSE2-NEXT: orq %rdi, %rdx -; SSE2-NEXT: orq %r10, %rdx -; SSE2-NEXT: xorq 32(%rsi), %rax -; SSE2-NEXT: xorq 48(%rsi), %rcx -; SSE2-NEXT: orq %r9, %rcx -; SSE2-NEXT: orq %rax, %rcx -; SSE2-NEXT: orq %r8, %rcx +; SSE2-NEXT: xorq 56(%rsi), %r11 +; SSE2-NEXT: orq %rdx, %r11 +; SSE2-NEXT: orq %rdi, %r11 +; SSE2-NEXT: orq %r8, %r11 +; SSE2-NEXT: xorq 32(%rsi), %r10 +; SSE2-NEXT: xorq 48(%rsi), %r9 +; SSE2-NEXT: orq %rcx, %r9 +; SSE2-NEXT: orq %r10, %r9 +; SSE2-NEXT: orq %rax, %r9 ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: orq %r11, %r9 ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: ne_i256_pair: ; SSE41: # %bb.0: -; SSE41-NEXT: movq 16(%rdi), %r9 -; SSE41-NEXT: movq 24(%rdi), %r11 -; SSE41-NEXT: movq (%rdi), %r8 -; SSE41-NEXT: movq 8(%rdi), %r10 -; SSE41-NEXT: xorq 8(%rsi), %r10 -; SSE41-NEXT: xorq 24(%rsi), %r11 -; SSE41-NEXT: xorq (%rsi), %r8 -; SSE41-NEXT: xorq 16(%rsi), %r9 -; SSE41-NEXT: movq 48(%rdi), %rcx -; SSE41-NEXT: movq 32(%rdi), %rax -; SSE41-NEXT: movq 56(%rdi), %rdx +; SSE41-NEXT: movq 16(%rdi), %rcx +; SSE41-NEXT: movq 24(%rdi), %rdx +; SSE41-NEXT: movq (%rdi), %rax +; SSE41-NEXT: movq 8(%rdi), %r8 +; SSE41-NEXT: xorq 8(%rsi), %r8 +; SSE41-NEXT: xorq 24(%rsi), %rdx +; SSE41-NEXT: xorq (%rsi), %rax +; SSE41-NEXT: xorq 16(%rsi), %rcx +; SSE41-NEXT: movq 48(%rdi), %r9 +; SSE41-NEXT: movq 32(%rdi), %r10 +; SSE41-NEXT: movq 56(%rdi), %r11 ; SSE41-NEXT: movq 40(%rdi), %rdi ; SSE41-NEXT: xorq 40(%rsi), %rdi -; SSE41-NEXT: xorq 56(%rsi), %rdx -; SSE41-NEXT: orq %r11, %rdx -; SSE41-NEXT: orq %rdi, %rdx -; SSE41-NEXT: orq %r10, %rdx -; SSE41-NEXT: xorq 32(%rsi), %rax -; SSE41-NEXT: xorq 48(%rsi), %rcx -; SSE41-NEXT: orq %r9, %rcx -; SSE41-NEXT: orq %rax, %rcx -; SSE41-NEXT: orq %r8, %rcx +; SSE41-NEXT: xorq 56(%rsi), %r11 +; SSE41-NEXT: orq %rdx, %r11 +; SSE41-NEXT: orq %rdi, %r11 +; SSE41-NEXT: orq %r8, %r11 +; 
SSE41-NEXT: xorq 32(%rsi), %r10 +; SSE41-NEXT: xorq 48(%rsi), %r9 +; SSE41-NEXT: orq %rcx, %r9 +; SSE41-NEXT: orq %r10, %r9 +; SSE41-NEXT: orq %rax, %r9 ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rdx, %rcx +; SSE41-NEXT: orq %r11, %r9 ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -850,59 +850,59 @@ define i32 @eq_i256_pair(ptr %a, ptr %b) { ; SSE2-LABEL: eq_i256_pair: ; SSE2: # %bb.0: -; SSE2-NEXT: movq 16(%rdi), %r9 -; SSE2-NEXT: movq 24(%rdi), %r11 -; SSE2-NEXT: movq (%rdi), %r8 -; SSE2-NEXT: movq 8(%rdi), %r10 -; SSE2-NEXT: xorq 8(%rsi), %r10 -; SSE2-NEXT: xorq 24(%rsi), %r11 -; SSE2-NEXT: xorq (%rsi), %r8 -; SSE2-NEXT: xorq 16(%rsi), %r9 -; SSE2-NEXT: movq 48(%rdi), %rcx -; SSE2-NEXT: movq 32(%rdi), %rax -; SSE2-NEXT: movq 56(%rdi), %rdx +; SSE2-NEXT: movq 16(%rdi), %rcx +; SSE2-NEXT: movq 24(%rdi), %rdx +; SSE2-NEXT: movq (%rdi), %rax +; SSE2-NEXT: movq 8(%rdi), %r8 +; SSE2-NEXT: xorq 8(%rsi), %r8 +; SSE2-NEXT: xorq 24(%rsi), %rdx +; SSE2-NEXT: xorq (%rsi), %rax +; SSE2-NEXT: xorq 16(%rsi), %rcx +; SSE2-NEXT: movq 48(%rdi), %r9 +; SSE2-NEXT: movq 32(%rdi), %r10 +; SSE2-NEXT: movq 56(%rdi), %r11 ; SSE2-NEXT: movq 40(%rdi), %rdi ; SSE2-NEXT: xorq 40(%rsi), %rdi -; SSE2-NEXT: xorq 56(%rsi), %rdx -; SSE2-NEXT: orq %r11, %rdx -; SSE2-NEXT: orq %rdi, %rdx -; SSE2-NEXT: orq %r10, %rdx -; SSE2-NEXT: xorq 32(%rsi), %rax -; SSE2-NEXT: xorq 48(%rsi), %rcx -; SSE2-NEXT: orq %r9, %rcx -; SSE2-NEXT: orq %rax, %rcx -; SSE2-NEXT: orq %r8, %rcx +; SSE2-NEXT: xorq 56(%rsi), %r11 +; SSE2-NEXT: orq %rdx, %r11 +; SSE2-NEXT: orq %rdi, %r11 +; SSE2-NEXT: orq %r8, %r11 +; SSE2-NEXT: xorq 32(%rsi), %r10 +; SSE2-NEXT: xorq 48(%rsi), %r9 +; SSE2-NEXT: orq %rcx, %r9 +; SSE2-NEXT: orq %r10, %r9 +; SSE2-NEXT: orq %rax, %r9 ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: orq %r11, %r9 ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: eq_i256_pair: ; SSE41: # %bb.0: -; SSE41-NEXT: movq 16(%rdi), %r9 -; SSE41-NEXT: movq 24(%rdi), %r11 -; SSE41-NEXT: movq (%rdi), %r8 -; SSE41-NEXT: movq 8(%rdi), %r10 -; SSE41-NEXT: xorq 8(%rsi), %r10 -; SSE41-NEXT: xorq 24(%rsi), %r11 -; SSE41-NEXT: xorq (%rsi), %r8 -; SSE41-NEXT: xorq 16(%rsi), %r9 -; SSE41-NEXT: movq 48(%rdi), %rcx -; SSE41-NEXT: movq 32(%rdi), %rax -; SSE41-NEXT: movq 56(%rdi), %rdx +; SSE41-NEXT: movq 16(%rdi), %rcx +; SSE41-NEXT: movq 24(%rdi), %rdx +; SSE41-NEXT: movq (%rdi), %rax +; SSE41-NEXT: movq 8(%rdi), %r8 +; SSE41-NEXT: xorq 8(%rsi), %r8 +; SSE41-NEXT: xorq 24(%rsi), %rdx +; SSE41-NEXT: xorq (%rsi), %rax +; SSE41-NEXT: xorq 16(%rsi), %rcx +; SSE41-NEXT: movq 48(%rdi), %r9 +; SSE41-NEXT: movq 32(%rdi), %r10 +; SSE41-NEXT: movq 56(%rdi), %r11 ; SSE41-NEXT: movq 40(%rdi), %rdi ; SSE41-NEXT: xorq 40(%rsi), %rdi -; SSE41-NEXT: xorq 56(%rsi), %rdx -; SSE41-NEXT: orq %r11, %rdx -; SSE41-NEXT: orq %rdi, %rdx -; SSE41-NEXT: orq %r10, %rdx -; SSE41-NEXT: xorq 32(%rsi), %rax -; SSE41-NEXT: xorq 48(%rsi), %rcx -; SSE41-NEXT: orq %r9, %rcx -; SSE41-NEXT: orq %rax, %rcx -; SSE41-NEXT: orq %r8, %rcx +; SSE41-NEXT: xorq 56(%rsi), %r11 +; SSE41-NEXT: orq %rdx, %r11 +; SSE41-NEXT: orq %rdi, %r11 +; SSE41-NEXT: orq %r8, %r11 +; SSE41-NEXT: xorq 32(%rsi), %r10 +; SSE41-NEXT: xorq 48(%rsi), %r9 +; SSE41-NEXT: orq %rcx, %r9 +; SSE41-NEXT: orq %r10, %r9 +; SSE41-NEXT: orq %rax, %r9 ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rdx, %rcx +; SSE41-NEXT: orq %r11, %r9 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -964,54 +964,54 @@ define i32 @ne_i512_pair(ptr %a, ptr %b) { ; NO512-LABEL: ne_i512_pair: ; NO512: # %bb.0: -; NO512-NEXT: 
movq 32(%rdi), %r8 -; NO512-NEXT: movq 48(%rdi), %r9 +; NO512-NEXT: movq 32(%rdi), %rax +; NO512-NEXT: movq 48(%rdi), %rcx ; NO512-NEXT: movq 40(%rdi), %rdx -; NO512-NEXT: movq 56(%rdi), %rcx -; NO512-NEXT: xorq 56(%rsi), %rcx -; NO512-NEXT: movq 120(%rdi), %rax -; NO512-NEXT: xorq 120(%rsi), %rax -; NO512-NEXT: orq %rcx, %rax -; NO512-NEXT: movq 88(%rdi), %rcx -; NO512-NEXT: xorq 88(%rsi), %rcx -; NO512-NEXT: orq %rcx, %rax -; NO512-NEXT: movq 24(%rdi), %rcx -; NO512-NEXT: xorq 24(%rsi), %rcx +; NO512-NEXT: movq 56(%rdi), %r8 +; NO512-NEXT: xorq 56(%rsi), %r8 +; NO512-NEXT: movq 120(%rdi), %r9 +; NO512-NEXT: xorq 120(%rsi), %r9 +; NO512-NEXT: orq %r8, %r9 +; NO512-NEXT: movq 88(%rdi), %r8 +; NO512-NEXT: xorq 88(%rsi), %r8 +; NO512-NEXT: orq %r8, %r9 +; NO512-NEXT: movq 24(%rdi), %r8 +; NO512-NEXT: xorq 24(%rsi), %r8 ; NO512-NEXT: xorq 40(%rsi), %rdx -; NO512-NEXT: orq %rcx, %rax -; NO512-NEXT: movq 104(%rdi), %rcx -; NO512-NEXT: xorq 104(%rsi), %rcx -; NO512-NEXT: orq %rdx, %rcx +; NO512-NEXT: orq %r8, %r9 +; NO512-NEXT: movq 104(%rdi), %r8 +; NO512-NEXT: xorq 104(%rsi), %r8 +; NO512-NEXT: orq %rdx, %r8 ; NO512-NEXT: movq 72(%rdi), %rdx ; NO512-NEXT: xorq 72(%rsi), %rdx -; NO512-NEXT: orq %rdx, %rcx -; NO512-NEXT: movq 16(%rdi), %r10 -; NO512-NEXT: orq %rax, %rcx -; NO512-NEXT: movq 8(%rdi), %rax -; NO512-NEXT: xorq 8(%rsi), %rax -; NO512-NEXT: xorq 48(%rsi), %r9 -; NO512-NEXT: orq %rax, %rcx -; NO512-NEXT: movq 112(%rdi), %rax -; NO512-NEXT: xorq 112(%rsi), %rax -; NO512-NEXT: orq %r9, %rax -; NO512-NEXT: movq 80(%rdi), %rdx -; NO512-NEXT: xorq 80(%rsi), %rdx -; NO512-NEXT: orq %rdx, %rax -; NO512-NEXT: movq (%rdi), %r9 -; NO512-NEXT: xorq 16(%rsi), %r10 -; NO512-NEXT: xorq (%rsi), %r9 -; NO512-NEXT: xorq 32(%rsi), %r8 -; NO512-NEXT: orq %r10, %rax +; NO512-NEXT: orq %rdx, %r8 +; NO512-NEXT: movq 16(%rdi), %rdx +; NO512-NEXT: orq %r9, %r8 +; NO512-NEXT: movq 8(%rdi), %r9 +; NO512-NEXT: xorq 8(%rsi), %r9 +; NO512-NEXT: xorq 48(%rsi), %rcx +; NO512-NEXT: orq %r9, %r8 +; NO512-NEXT: movq 112(%rdi), %r9 +; NO512-NEXT: xorq 112(%rsi), %r9 +; NO512-NEXT: orq %rcx, %r9 +; NO512-NEXT: movq 80(%rdi), %rcx +; NO512-NEXT: xorq 80(%rsi), %rcx +; NO512-NEXT: orq %rcx, %r9 +; NO512-NEXT: movq (%rdi), %rcx +; NO512-NEXT: xorq 16(%rsi), %rdx +; NO512-NEXT: xorq (%rsi), %rcx +; NO512-NEXT: xorq 32(%rsi), %rax +; NO512-NEXT: orq %rdx, %r9 ; NO512-NEXT: movq 96(%rdi), %rdx ; NO512-NEXT: movq 64(%rdi), %rdi ; NO512-NEXT: xorq 64(%rsi), %rdi ; NO512-NEXT: xorq 96(%rsi), %rdx -; NO512-NEXT: orq %r8, %rdx -; NO512-NEXT: orq %rdi, %rdx ; NO512-NEXT: orq %rax, %rdx +; NO512-NEXT: orq %rdi, %rdx ; NO512-NEXT: orq %r9, %rdx -; NO512-NEXT: xorl %eax, %eax ; NO512-NEXT: orq %rcx, %rdx +; NO512-NEXT: xorl %eax, %eax +; NO512-NEXT: orq %r8, %rdx ; NO512-NEXT: setne %al ; NO512-NEXT: retq ; @@ -1058,54 +1058,54 @@ define i32 @eq_i512_pair(ptr %a, ptr %b) { ; NO512-LABEL: eq_i512_pair: ; NO512: # %bb.0: -; NO512-NEXT: movq 32(%rdi), %r8 -; NO512-NEXT: movq 48(%rdi), %r9 +; NO512-NEXT: movq 32(%rdi), %rax +; NO512-NEXT: movq 48(%rdi), %rcx ; NO512-NEXT: movq 40(%rdi), %rdx -; NO512-NEXT: movq 56(%rdi), %rcx -; NO512-NEXT: xorq 56(%rsi), %rcx -; NO512-NEXT: movq 120(%rdi), %rax -; NO512-NEXT: xorq 120(%rsi), %rax -; NO512-NEXT: orq %rcx, %rax -; NO512-NEXT: movq 88(%rdi), %rcx -; NO512-NEXT: xorq 88(%rsi), %rcx -; NO512-NEXT: orq %rcx, %rax -; NO512-NEXT: movq 24(%rdi), %rcx -; NO512-NEXT: xorq 24(%rsi), %rcx +; NO512-NEXT: movq 56(%rdi), %r8 +; NO512-NEXT: xorq 56(%rsi), %r8 +; NO512-NEXT: movq 120(%rdi), %r9 +; 
NO512-NEXT: xorq 120(%rsi), %r9 +; NO512-NEXT: orq %r8, %r9 +; NO512-NEXT: movq 88(%rdi), %r8 +; NO512-NEXT: xorq 88(%rsi), %r8 +; NO512-NEXT: orq %r8, %r9 +; NO512-NEXT: movq 24(%rdi), %r8 +; NO512-NEXT: xorq 24(%rsi), %r8 ; NO512-NEXT: xorq 40(%rsi), %rdx -; NO512-NEXT: orq %rcx, %rax -; NO512-NEXT: movq 104(%rdi), %rcx -; NO512-NEXT: xorq 104(%rsi), %rcx -; NO512-NEXT: orq %rdx, %rcx +; NO512-NEXT: orq %r8, %r9 +; NO512-NEXT: movq 104(%rdi), %r8 +; NO512-NEXT: xorq 104(%rsi), %r8 +; NO512-NEXT: orq %rdx, %r8 ; NO512-NEXT: movq 72(%rdi), %rdx ; NO512-NEXT: xorq 72(%rsi), %rdx -; NO512-NEXT: orq %rdx, %rcx -; NO512-NEXT: movq 16(%rdi), %r10 -; NO512-NEXT: orq %rax, %rcx -; NO512-NEXT: movq 8(%rdi), %rax -; NO512-NEXT: xorq 8(%rsi), %rax -; NO512-NEXT: xorq 48(%rsi), %r9 -; NO512-NEXT: orq %rax, %rcx -; NO512-NEXT: movq 112(%rdi), %rax -; NO512-NEXT: xorq 112(%rsi), %rax -; NO512-NEXT: orq %r9, %rax -; NO512-NEXT: movq 80(%rdi), %rdx -; NO512-NEXT: xorq 80(%rsi), %rdx -; NO512-NEXT: orq %rdx, %rax -; NO512-NEXT: movq (%rdi), %r9 -; NO512-NEXT: xorq 16(%rsi), %r10 -; NO512-NEXT: xorq (%rsi), %r9 -; NO512-NEXT: xorq 32(%rsi), %r8 -; NO512-NEXT: orq %r10, %rax +; NO512-NEXT: orq %rdx, %r8 +; NO512-NEXT: movq 16(%rdi), %rdx +; NO512-NEXT: orq %r9, %r8 +; NO512-NEXT: movq 8(%rdi), %r9 +; NO512-NEXT: xorq 8(%rsi), %r9 +; NO512-NEXT: xorq 48(%rsi), %rcx +; NO512-NEXT: orq %r9, %r8 +; NO512-NEXT: movq 112(%rdi), %r9 +; NO512-NEXT: xorq 112(%rsi), %r9 +; NO512-NEXT: orq %rcx, %r9 +; NO512-NEXT: movq 80(%rdi), %rcx +; NO512-NEXT: xorq 80(%rsi), %rcx +; NO512-NEXT: orq %rcx, %r9 +; NO512-NEXT: movq (%rdi), %rcx +; NO512-NEXT: xorq 16(%rsi), %rdx +; NO512-NEXT: xorq (%rsi), %rcx +; NO512-NEXT: xorq 32(%rsi), %rax +; NO512-NEXT: orq %rdx, %r9 ; NO512-NEXT: movq 96(%rdi), %rdx ; NO512-NEXT: movq 64(%rdi), %rdi ; NO512-NEXT: xorq 64(%rsi), %rdi ; NO512-NEXT: xorq 96(%rsi), %rdx -; NO512-NEXT: orq %r8, %rdx -; NO512-NEXT: orq %rdi, %rdx ; NO512-NEXT: orq %rax, %rdx +; NO512-NEXT: orq %rdi, %rdx ; NO512-NEXT: orq %r9, %rdx -; NO512-NEXT: xorl %eax, %eax ; NO512-NEXT: orq %rcx, %rdx +; NO512-NEXT: xorl %eax, %eax +; NO512-NEXT: orq %r8, %rdx ; NO512-NEXT: sete %al ; NO512-NEXT: retq ; @@ -1178,18 +1178,18 @@ define i1 @eq_i512_args(i512 %a, i512 %b) { ; ANY-LABEL: eq_i512_args: ; ANY: # %bb.0: -; ANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; ANY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax +; ANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; ANY-NEXT: orq %rax, %rcx +; ANY-NEXT: orq %r10, %rcx ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 ; ANY-NEXT: orq %rcx, %r9 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi ; ANY-NEXT: orq %r9, %rsi -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; ANY-NEXT: orq %r10, %rdx +; ANY-NEXT: orq %rax, %rdx ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 ; ANY-NEXT: orq %rdx, %r8 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi @@ -1305,24 +1305,24 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) { ; ANY-LABEL: eq_i512_load_arg: ; ANY: # %bb.0: -; ANY-NEXT: movq 40(%rdi), %r10 -; ANY-NEXT: movq 48(%rdi), %rax +; ANY-NEXT: movq 40(%rdi), %rax +; ANY-NEXT: movq 48(%rdi), %r10 ; ANY-NEXT: movq 56(%rdi), %r11 ; ANY-NEXT: xorq 24(%rdi), %r8 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r11 ; ANY-NEXT: orq %r8, %r11 ; ANY-NEXT: xorq 8(%rdi), %rdx -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 -; ANY-NEXT: orq %r11, %r10 -; ANY-NEXT: orq %rdx, %r10 +; ANY-NEXT: xorq 
{{[0-9]+}}(%rsp), %rax +; ANY-NEXT: orq %r11, %rax +; ANY-NEXT: orq %rdx, %rax ; ANY-NEXT: xorq 32(%rdi), %r9 ; ANY-NEXT: xorq (%rdi), %rsi ; ANY-NEXT: xorq 16(%rdi), %rcx -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; ANY-NEXT: orq %rcx, %rax -; ANY-NEXT: orq %r9, %rax -; ANY-NEXT: orq %rsi, %rax -; ANY-NEXT: orq %r10, %rax +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 +; ANY-NEXT: orq %rcx, %r10 +; ANY-NEXT: orq %r9, %r10 +; ANY-NEXT: orq %rsi, %r10 +; ANY-NEXT: orq %rax, %r10 ; ANY-NEXT: sete %al ; ANY-NEXT: retq %a = load i512, ptr %p diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -1014,27 +1014,27 @@ ; ; x86_64-LABEL: test_ashr_v2i128: ; x86_64: # %bb.0: # %entry -; x86_64-NEXT: movq %rcx, %r11 +; x86_64-NEXT: movq %rcx, %rax ; x86_64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; x86_64-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d ; x86_64-NEXT: movl %r9d, %ecx -; x86_64-NEXT: shrdq %cl, %r11, %rdx +; x86_64-NEXT: shrdq %cl, %rax, %rdx ; x86_64-NEXT: movl %r8d, %ecx ; x86_64-NEXT: shrdq %cl, %rsi, %rdi -; x86_64-NEXT: movq %rsi, %rax -; x86_64-NEXT: sarq %cl, %rax +; x86_64-NEXT: movq %rsi, %r11 +; x86_64-NEXT: sarq %cl, %r11 ; x86_64-NEXT: sarq $63, %rsi ; x86_64-NEXT: testb $64, %r8b -; x86_64-NEXT: cmovneq %rax, %rdi -; x86_64-NEXT: cmoveq %rax, %rsi -; x86_64-NEXT: movq %r11, %rax +; x86_64-NEXT: cmovneq %r11, %rdi +; x86_64-NEXT: cmoveq %r11, %rsi +; x86_64-NEXT: movq %rax, %r8 ; x86_64-NEXT: movl %r9d, %ecx -; x86_64-NEXT: sarq %cl, %rax -; x86_64-NEXT: sarq $63, %r11 +; x86_64-NEXT: sarq %cl, %r8 +; x86_64-NEXT: sarq $63, %rax ; x86_64-NEXT: testb $64, %r9b -; x86_64-NEXT: cmovneq %rax, %rdx -; x86_64-NEXT: cmoveq %rax, %r11 -; x86_64-NEXT: movq %r11, 24(%r10) +; x86_64-NEXT: cmovneq %r8, %rdx +; x86_64-NEXT: cmoveq %r8, %rax +; x86_64-NEXT: movq %rax, 24(%r10) ; x86_64-NEXT: movq %rdx, 16(%r10) ; x86_64-NEXT: movq %rsi, 8(%r10) ; x86_64-NEXT: movq %rdi, (%r10) diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -2207,34 +2207,34 @@ ; ; X64-SSE-LABEL: PR34947: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movzwl 16(%rdi), %r8d +; X64-SSE-NEXT: movzwl 16(%rdi), %ecx ; X64-SSE-NEXT: movdqa (%rdi), %xmm3 ; X64-SSE-NEXT: movdqa (%rsi), %xmm0 ; X64-SSE-NEXT: movdqa 16(%rsi), %xmm1 ; X64-SSE-NEXT: pxor %xmm4, %xmm4 ; X64-SSE-NEXT: movdqa %xmm3, %xmm2 ; X64-SSE-NEXT: pextrw $7, %xmm3, %eax -; X64-SSE-NEXT: pextrw $4, %xmm3, %r9d -; X64-SSE-NEXT: pextrw $0, %xmm3, %r10d -; X64-SSE-NEXT: pextrw $1, %xmm3, %r11d -; X64-SSE-NEXT: pextrw $3, %xmm3, %ecx +; X64-SSE-NEXT: pextrw $4, %xmm3, %edi +; X64-SSE-NEXT: pextrw $0, %xmm3, %r8d +; X64-SSE-NEXT: pextrw $1, %xmm3, %r9d +; X64-SSE-NEXT: pextrw $3, %xmm3, %r10d ; X64-SSE-NEXT: movdqa %xmm3, %xmm5 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; X64-SSE-NEXT: movd %xmm3, %edi +; X64-SSE-NEXT: movd %xmm3, %r11d ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %edi +; X64-SSE-NEXT: divl %r11d ; X64-SSE-NEXT: movd %edx, %xmm3 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; X64-SSE-NEXT: movd %xmm4, %eax ; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; X64-SSE-NEXT: movd %xmm4, %edi +; 
X64-SSE-NEXT: movd %xmm4, %r11d ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %edi +; X64-SSE-NEXT: divl %r11d ; X64-SSE-NEXT: movd %edx, %xmm4 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; X64-SSE-NEXT: movl %r9d, %eax +; X64-SSE-NEXT: movl %edi, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 16(%rsi) ; X64-SSE-NEXT: movd %edx, %xmm3 @@ -2247,33 +2247,33 @@ ; X64-SSE-NEXT: movd %edx, %xmm1 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; X64-SSE-NEXT: movl %r10d, %eax +; X64-SSE-NEXT: movl %r8d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl (%rsi) ; X64-SSE-NEXT: movd %edx, %xmm1 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X64-SSE-NEXT: movd %xmm2, %edi -; X64-SSE-NEXT: movl %r11d, %eax +; X64-SSE-NEXT: movl %r9d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %edi ; X64-SSE-NEXT: movd %edx, %xmm2 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-SSE-NEXT: movd %xmm2, %edi -; X64-SSE-NEXT: movl %ecx, %eax +; X64-SSE-NEXT: movl %r10d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %edi ; X64-SSE-NEXT: movd %edx, %xmm2 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] ; X64-SSE-NEXT: movd %xmm4, %eax ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; X64-SSE-NEXT: movd %xmm0, %ecx +; X64-SSE-NEXT: movd %xmm0, %edi ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: divl %edi ; X64-SSE-NEXT: movd %edx, %xmm0 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-SSE-NEXT: movl %r8d, %eax +; X64-SSE-NEXT: movl %ecx, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 32(%rsi) ; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] @@ -2305,27 +2305,27 @@ ; X64-AVX1-NEXT: vmovd %xmm2, %eax ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl 32(%rsi) -; X64-AVX1-NEXT: movl %edx, %r8d +; X64-AVX1-NEXT: movl %edx, %ecx ; X64-AVX1-NEXT: vpextrd $3, %xmm1, %eax ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl 28(%rsi) -; X64-AVX1-NEXT: movl %edx, %r9d +; X64-AVX1-NEXT: movl %edx, %edi ; X64-AVX1-NEXT: vpextrd $2, %xmm1, %eax ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl 24(%rsi) -; X64-AVX1-NEXT: movl %edx, %r10d +; X64-AVX1-NEXT: movl %edx, %r8d ; X64-AVX1-NEXT: vpextrd $1, %xmm1, %eax ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl 20(%rsi) -; X64-AVX1-NEXT: movl %edx, %r11d +; X64-AVX1-NEXT: movl %edx, %r9d ; X64-AVX1-NEXT: vmovd %xmm1, %eax ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl 16(%rsi) -; X64-AVX1-NEXT: movl %edx, %ecx +; X64-AVX1-NEXT: movl %edx, %r10d ; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl 12(%rsi) -; X64-AVX1-NEXT: movl %edx, %edi +; X64-AVX1-NEXT: movl %edx, %r11d ; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl 8(%rsi) @@ -2340,15 +2340,15 @@ ; X64-AVX1-NEXT: vmovd %edx, %xmm0 ; X64-AVX1-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $3, %r11d, %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovd %ecx, %xmm2 -; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 
-; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 -; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vmovd %r10d, %xmm2 +; X64-AVX1-NEXT: vpinsrd $1, %r9d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $2, %r8d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: imull $8199, %r8d, %eax # imm = 0x2007 +; X64-AVX1-NEXT: imull $8199, %ecx, %eax # imm = 0x2007 ; X64-AVX1-NEXT: movl %eax, (%rax) ; X64-AVX1-NEXT: vmovdqa %xmm1, (%rax) ; X64-AVX1-NEXT: vmovdqa %xmm0, (%rax) diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll --- a/llvm/test/CodeGen/X86/smul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll @@ -841,148 +841,147 @@ ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r9, %rbp -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r8, %r13 -; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq %r9, %r10 +; X64-NEXT: movq %r8, %rbp +; X64-NEXT: movq %rcx, %r12 ; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rsi, %rbx ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; X64-NEXT: andl $1, %ecx ; X64-NEXT: negq %rcx -; X64-NEXT: andl $1, %r14d -; X64-NEXT: negq %r14 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r13 +; X64-NEXT: andl $1, %r12d +; X64-NEXT: negq %r12 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq %rax, %r15 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %rdx, %r11 +; X64-NEXT: addq %rdx, %r15 ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %r10, %r14 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %rax, %r11 +; X64-NEXT: addq %rax, %r15 ; X64-NEXT: adcq %rdx, %r9 -; X64-NEXT: setb %bl -; X64-NEXT: movzbl %bl, %r10d +; X64-NEXT: setb %dil +; X64-NEXT: movzbl %dil, %r10d ; X64-NEXT: addq %rax, %r9 ; X64-NEXT: adcq %rdx, %r10 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r13 +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r15, %rbp +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rsi, %r11 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, %r12 -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %rbp, %rax +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rbx, %r13 +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: addq %r11, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rdi, %rsi +; X64-NEXT: adcq %rdi, %rbx ; X64-NEXT: setb %dil ; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %r15 
-; X64-NEXT: addq %rsi, %rax -; X64-NEXT: movzbl %dil, %edx -; X64-NEXT: adcq %rdx, %r15 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: adcq %r11, %r15 +; X64-NEXT: movzbl %dil, %edx +; X64-NEXT: adcq %rdx, %rsi +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: adcq %r15, %rsi ; X64-NEXT: adcq $0, %r9 ; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r11, %rax ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %r11, %rbp -; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: addq %r11, %rbx +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %r11 -; X64-NEXT: addq %r12, %rbp -; X64-NEXT: adcq %rax, %r11 +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: addq %r13, %rbx +; X64-NEXT: adcq %r11, %r15 ; X64-NEXT: setb %al -; X64-NEXT: addq %rdi, %r11 +; X64-NEXT: addq %r8, %r15 ; X64-NEXT: movzbl %al, %r8d ; X64-NEXT: adcq %rdx, %r8 -; X64-NEXT: addq %r12, %rsi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %rbp -; X64-NEXT: adcq $0, %r11 +; X64-NEXT: addq %r13, %rdi +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rsi, %rbx +; X64-NEXT: adcq $0, %r15 ; X64-NEXT: adcq $0, %r8 -; X64-NEXT: addq %r9, %r11 +; X64-NEXT: addq %r9, %r15 ; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rdx, %r15 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rdx, %r11 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: addq %rax, %r15 +; X64-NEXT: addq %rax, %r11 ; X64-NEXT: adcq %rdx, %rdi ; X64-NEXT: setb %r9b ; X64-NEXT: addq %rax, %rdi ; X64-NEXT: movzbl %r9b, %esi ; X64-NEXT: adcq %rdx, %rsi -; X64-NEXT: addq %rax, %r11 -; X64-NEXT: adcq %r8, %r15 +; X64-NEXT: addq %rax, %r15 +; X64-NEXT: adcq %r8, %r11 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rdi ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: movq %rsi, %r8 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: addq %rax, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: addq %r14, %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: addq %rbx, %r8 -; X64-NEXT: adcq %rax, %r10 -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: addq %r9, %r8 +; X64-NEXT: adcq %r14, %r10 ; X64-NEXT: setb %al ; X64-NEXT: addq %rsi, %r10 ; X64-NEXT: movzbl %al, %esi ; X64-NEXT: adcq %rdx, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: imulq %r14, %rax -; X64-NEXT: addq %r9, %rax -; X64-NEXT: imulq %r14, %r13 -; X64-NEXT: addq %rax, %r13 -; X64-NEXT: movq %r14, %rax +; 
X64-NEXT: imulq %r12, %rax +; X64-NEXT: addq %r14, %rax +; X64-NEXT: imulq %r12, %rbp +; X64-NEXT: addq %rax, %rbp +; X64-NEXT: movq %r12, %rax ; X64-NEXT: imulq %rcx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rbx, %r14 -; X64-NEXT: addq %rax, %r14 -; X64-NEXT: adcq %rdx, %r13 -; X64-NEXT: addq %r10, %r14 -; X64-NEXT: adcq %rsi, %r13 -; X64-NEXT: movq %r12, %rbx +; X64-NEXT: movq %r9, %r12 +; X64-NEXT: addq %rax, %r12 +; X64-NEXT: adcq %rdx, %rbp +; X64-NEXT: addq %r10, %r12 +; X64-NEXT: adcq %rsi, %rbp +; X64-NEXT: movq %r13, %r14 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: addq %rsi, %rbx +; X64-NEXT: addq %rsi, %r14 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: addq %rdx, %r14 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: adcq %r9, %rsi ; X64-NEXT: setb %r10b @@ -994,35 +993,35 @@ ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload ; X64-NEXT: imulq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: addq %r9, %rcx -; X64-NEXT: addq %r12, %rax +; X64-NEXT: addq %r13, %rax ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: addq %rsi, %rax ; X64-NEXT: adcq %r10, %rcx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; X64-NEXT: adcq %r8, %rbx -; X64-NEXT: adcq %r14, %rax -; X64-NEXT: adcq %r13, %rcx -; X64-NEXT: addq %r11, %r12 -; X64-NEXT: adcq %r15, %rbx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: adcq %r8, %r14 +; X64-NEXT: adcq %r12, %rax +; X64-NEXT: adcq %rbp, %rcx +; X64-NEXT: addq %r15, %r13 +; X64-NEXT: adcq %r11, %r14 ; X64-NEXT: adcq %rdi, %rax ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: movq %rbp, %rdx +; X64-NEXT: movq %rbx, %rdx ; X64-NEXT: sarq $63, %rdx ; X64-NEXT: xorq %rdx, %rcx -; X64-NEXT: xorq %rdx, %rbx -; X64-NEXT: orq %rcx, %rbx +; X64-NEXT: xorq %rdx, %r14 +; X64-NEXT: orq %rcx, %r14 ; X64-NEXT: xorq %rdx, %rax -; X64-NEXT: orq %rbx, %rax -; X64-NEXT: xorq %r12, %rdx +; X64-NEXT: orq %r14, %rax +; X64-NEXT: xorq %r13, %rdx ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movl %eax, %esi ; X64-NEXT: andl $1, %esi ; X64-NEXT: movq %rsi, %rcx ; X64-NEXT: negq %rcx -; X64-NEXT: xorq %rcx, %rbp +; X64-NEXT: xorq %rcx, %rbx ; X64-NEXT: xorq %rax, %rcx -; X64-NEXT: orq %rbp, %rcx +; X64-NEXT: orq %rbx, %rcx ; X64-NEXT: orq %rdx, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -9,71 +9,67 @@ ; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: pushq %r14 ; X64-NEXT: .cfi_def_cfa_offset 24 -; X64-NEXT: pushq %r12 -; X64-NEXT: .cfi_def_cfa_offset 32 ; X64-NEXT: pushq %rbx -; X64-NEXT: .cfi_def_cfa_offset 40 -; X64-NEXT: .cfi_offset %rbx, -40 -; X64-NEXT: .cfi_offset %r12, -32 +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: .cfi_offset %rbx, -32 ; X64-NEXT: .cfi_offset %r14, -24 ; X64-NEXT: .cfi_offset %r15, -16 -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rsi, %r10 -; X64-NEXT: movq %rdi, %r15 -; X64-NEXT: sarq $63, %rsi +; X64-NEXT: movq 
%rdx, %rbx +; X64-NEXT: movq %rdi, %r10 +; X64-NEXT: movq %rsi, %r14 +; X64-NEXT: sarq $63, %r14 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: imulq %rsi, %rdi +; X64-NEXT: imulq %r14, %rdi ; X64-NEXT: movq %rdx, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: imulq %rcx, %rsi -; X64-NEXT: addq %rdx, %rsi +; X64-NEXT: imulq %rcx, %r14 +; X64-NEXT: addq %rdx, %r14 ; X64-NEXT: movq %rcx, %rdi ; X64-NEXT: sarq $63, %rdi -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: imulq %r10, %rbx +; X64-NEXT: movq %rdi, %r15 +; X64-NEXT: imulq %rsi, %r15 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rbx, %rdx -; X64-NEXT: imulq %r15, %rdi +; X64-NEXT: addq %r15, %rdx +; X64-NEXT: imulq %r10, %rdi ; X64-NEXT: addq %rdx, %rdi ; X64-NEXT: addq %r9, %r11 -; X64-NEXT: adcq %rsi, %rdi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: adcq %r14, %rdi +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r14, %rsi +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r14, %r15 ; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rsi, %r14 -; X64-NEXT: adcq %rbx, %r15 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r15, %r10 +; X64-NEXT: adcq %rbx, %r14 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %esi -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movzbl %al, %ebx +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: addq %r15, %rax -; X64-NEXT: adcq %rsi, %rdx +; X64-NEXT: addq %r14, %rax +; X64-NEXT: adcq %rbx, %rdx ; X64-NEXT: addq %r11, %rax ; X64-NEXT: adcq %rdi, %rdx -; X64-NEXT: movq %r14, 8(%r8) -; X64-NEXT: sarq $63, %r14 -; X64-NEXT: xorq %r14, %rdx -; X64-NEXT: xorq %rax, %r14 -; X64-NEXT: orq %rdx, %r14 +; X64-NEXT: movq %r10, 8(%r8) +; X64-NEXT: sarq $63, %r10 +; X64-NEXT: xorq %r10, %rdx +; X64-NEXT: xorq %rax, %r10 +; X64-NEXT: orq %rdx, %r10 ; X64-NEXT: setne %al ; X64-NEXT: movq %r9, (%r8) ; X64-NEXT: popq %rbx -; X64-NEXT: popq %r12 ; X64-NEXT: popq %r14 ; X64-NEXT: popq %r15 ; X64-NEXT: retq @@ -364,228 +360,232 @@ ; X64-NEXT: .cfi_offset %r14, -32 ; X64-NEXT: .cfi_offset %r15, -24 ; X64-NEXT: .cfi_offset %rbp, -16 +; X64-NEXT: movq %rcx, %r11 ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rsi, %r14 +; X64-NEXT: movq %rsi, %r15 ; X64-NEXT: movq %rdx, %rax ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r10, %rbp -; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rsi, %r10 +; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rbp, %r12 -; X64-NEXT: adcq %rsi, %rbx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq 
%r10, %r14 +; X64-NEXT: adcq %rcx, %r12 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %esi -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movzbl %al, %ecx +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rbx, %r10 -; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r12, %rbx +; X64-NEXT: adcq %rcx, %r11 ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %r8, %rcx +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rsi, %rbx -; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r8, %r13 +; X64-NEXT: adcq $0, %r12 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %r13 +; X64-NEXT: movq %r9, %rsi +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rbp, %rdi -; X64-NEXT: setb %bl -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rdi, %rsi -; X64-NEXT: movzbl %bl, %eax -; X64-NEXT: adcq %rax, %r9 +; X64-NEXT: adcq %r12, %r10 +; X64-NEXT: setb %cl +; X64-NEXT: movq %r15, %r9 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r10, %r8 +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: adcq %rax, %rbp ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; X64-NEXT: addq %r11, %rsi -; X64-NEXT: adcq %r12, %r9 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r13, %rbx -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r13, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload +; X64-NEXT: adcq %r14, %rbp +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, %rsi +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rdi, %rcx -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq %rbp, %rdi -; X64-NEXT: setb %r11b -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rdi, %rbp -; X64-NEXT: movzbl %r11b, %eax -; X64-NEXT: adcq %rax, %rcx -; X64-NEXT: addq 
%rsi, %r12 -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r9, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: addq %r10, %rbp -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload -; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %r10, %r9 +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: addq %r9, %rax +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: adcq %r13, %r11 +; X64-NEXT: setb %cl ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r11, %r13 +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: adcq %rax, %r10 +; X64-NEXT: addq %r8, %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %rbp, %rdi +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: addq %rbx, %r13 +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload +; X64-NEXT: setb %cl +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r10, %rbx -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: adcq %rdi, %rsi -; X64-NEXT: setb %dil -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rsi, %r11 -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rbp, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rcx, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 1-byte Folded Reload -; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: adcq $0, %r12 -; X64-NEXT: movq %r14, %rsi -; X64-NEXT: sarq $63, %rsi -; X64-NEXT: movq %rsi, %rdi -; X64-NEXT: imulq %r13, %rdi -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: imulq %rsi, %r15 -; X64-NEXT: addq %rdx, %r15 -; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp ## 8-byte Reload -; X64-NEXT: imulq %rbp, %rbx -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: imulq %rsi, %r8 -; X64-NEXT: addq %rbx, %r8 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: adcq %r15, %r8 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rsi, %r8 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: addq %r8, 
%rax +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: adcq %rdi, %r9 +; X64-NEXT: setb %r8b +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r9, %r14 +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %rbp +; X64-NEXT: addq %r13, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r10, %rsi +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: adcq %rax, %r14 ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: addq %rbx, %r15 -; X64-NEXT: adcq %rdi, %rbp -; X64-NEXT: setb %bl -; X64-NEXT: addq %rax, %rbp -; X64-NEXT: movzbl %bl, %r9d -; X64-NEXT: adcq %rdx, %r9 -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: adcq %r8, %r9 +; X64-NEXT: movq %rbx, %r13 +; X64-NEXT: movq %rbx, %r10 ; X64-NEXT: sarq $63, %r13 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload -; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq %r13, %rcx +; X64-NEXT: imulq %r12, %rcx +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: imulq %r13, %r15 +; X64-NEXT: addq %rdx, %r15 +; X64-NEXT: movq %r13, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload +; X64-NEXT: imulq %rdi, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: addq %rdx, %rcx ; X64-NEXT: imulq %r13, %rsi +; X64-NEXT: addq %rcx, %rsi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: addq %rax, %r8 +; X64-NEXT: adcq %r15, %rsi +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload -; X64-NEXT: movq %r10, %rcx -; X64-NEXT: imulq %r13, %rcx -; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r9, %r15 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: addq %rcx, %r15 +; X64-NEXT: adcq %r9, %r13 +; X64-NEXT: setb %cl +; X64-NEXT: addq %rax, %r13 +; X64-NEXT: movzbl %cl, %r9d +; X64-NEXT: adcq %rdx, %r9 +; X64-NEXT: addq %r8, %r13 +; X64-NEXT: adcq %rsi, %r9 +; X64-NEXT: sarq $63, %r12 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: imulq %r12, %r8 +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: imulq %r13, %rsi -; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: addq %rdx, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: imulq %r12, %rbx +; X64-NEXT: addq %r8, %rbx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: imulq %r12, %rcx +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq %r14, %rax -; X64-NEXT: imulq %r13, %rax -; X64-NEXT: addq %rdx, %rax -; X64-NEXT: addq %rdi, %r8 -; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: addq %rbx, %rsi -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: addq %rax, %rsi -; X64-NEXT: adcq %rdx, %rbx -; X64-NEXT: setb %cl +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: imulq %r12, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: addq %rsi, %r8 +; X64-NEXT: adcq %rbx, %r10 +; X64-NEXT: movq %rsi, %rbx 
+; X64-NEXT: addq %r11, %rbx +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: addq %rax, %rbx +; X64-NEXT: adcq %rdx, %r11 +; X64-NEXT: setb %cl +; X64-NEXT: addq %rax, %r11 ; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rdx, %rax -; X64-NEXT: addq %r8, %rbx -; X64-NEXT: adcq %r14, %rax -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Folded Reload -; X64-NEXT: adcq %r15, %rsi -; X64-NEXT: adcq %rbp, %rbx +; X64-NEXT: addq %r8, %r11 +; X64-NEXT: adcq %r10, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload +; X64-NEXT: adcq %r15, %rbx +; X64-NEXT: adcq %r13, %r11 ; X64-NEXT: adcq %r9, %rax -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload -; X64-NEXT: adcq %r11, %rbx -; X64-NEXT: adcq %r12, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Folded Reload +; X64-NEXT: adcq %r14, %r11 +; X64-NEXT: adcq %rbp, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: sarq $63, %rcx ; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: xorq %rcx, %rsi -; X64-NEXT: orq %rax, %rsi ; X64-NEXT: xorq %rcx, %rbx -; X64-NEXT: xorq %rdi, %rcx +; X64-NEXT: orq %rax, %rbx +; X64-NEXT: xorq %rcx, %r11 +; X64-NEXT: xorq %rsi, %rcx +; X64-NEXT: orq %r11, %rcx ; X64-NEXT: orq %rbx, %rcx -; X64-NEXT: orq %rsi, %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq %rdx, 24(%rax) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll b/llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll --- a/llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll +++ b/llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll @@ -148,12 +148,12 @@ ; X64-NOPIC-NEXT: sarq $63, %rax ; X64-NOPIC-NEXT: shlq $47, %rax ; X64-NOPIC-NEXT: orq %rax, %rsp -; X64-NOPIC-NEXT: movq $.Lslh_ret_addr2, %rbp +; X64-NOPIC-NEXT: movq $.Lslh_ret_addr2, %r15 ; X64-NOPIC-NEXT: callq f@PLT ; X64-NOPIC-NEXT: .Lslh_ret_addr2: ; X64-NOPIC-NEXT: movq %rsp, %rax ; X64-NOPIC-NEXT: sarq $63, %rax -; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr2, %rbp +; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr2, %r15 ; X64-NOPIC-NEXT: cmovneq %r14, %rax ; X64-NOPIC-NEXT: movl (%rbx), %ebp ; X64-NOPIC-NEXT: shlq $47, %rax @@ -190,13 +190,13 @@ ; X64-NOPIC-MCM-NEXT: sarq $63, %rax ; X64-NOPIC-MCM-NEXT: shlq $47, %rax ; X64-NOPIC-MCM-NEXT: orq %rax, %rsp -; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr2(%rip), %rbp +; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr2(%rip), %r15 ; X64-NOPIC-MCM-NEXT: callq f@PLT ; X64-NOPIC-MCM-NEXT: .Lslh_ret_addr2: ; X64-NOPIC-MCM-NEXT: movq %rsp, %rax ; X64-NOPIC-MCM-NEXT: sarq $63, %rax ; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr2(%rip), %rcx -; X64-NOPIC-MCM-NEXT: cmpq %rcx, %rbp +; X64-NOPIC-MCM-NEXT: cmpq %rcx, %r15 ; X64-NOPIC-MCM-NEXT: cmovneq %r14, %rax ; X64-NOPIC-MCM-NEXT: movl (%rbx), %ebp ; X64-NOPIC-MCM-NEXT: shlq $47, %rax @@ -234,13 +234,13 @@ ; X64-PIC-NEXT: sarq $63, %rax ; X64-PIC-NEXT: shlq $47, %rax ; X64-PIC-NEXT: orq %rax, %rsp -; X64-PIC-NEXT: leaq .Lslh_ret_addr2(%rip), %rbp +; X64-PIC-NEXT: leaq .Lslh_ret_addr2(%rip), %r15 ; X64-PIC-NEXT: callq f@PLT ; X64-PIC-NEXT: .Lslh_ret_addr2: ; X64-PIC-NEXT: movq %rsp, %rax ; X64-PIC-NEXT: sarq $63, %rax ; X64-PIC-NEXT: leaq .Lslh_ret_addr2(%rip), %rcx -; X64-PIC-NEXT: cmpq %rcx, %rbp +; 
X64-PIC-NEXT: cmpq %rcx, %r15 ; X64-PIC-NEXT: cmovneq %r14, %rax ; X64-PIC-NEXT: movl (%rbx), %ebp ; X64-PIC-NEXT: shlq $47, %rax @@ -295,18 +295,18 @@ ; X64-NOPIC-NEXT: shlq $47, %rax ; X64-NOPIC-NEXT: movq %r14, %rdi ; X64-NOPIC-NEXT: orq %rax, %rsp -; X64-NOPIC-NEXT: movq $.Lslh_ret_addr4, %rbp +; X64-NOPIC-NEXT: movq $.Lslh_ret_addr4, %r12 ; X64-NOPIC-NEXT: callq setjmp@PLT ; X64-NOPIC-NEXT: .Lslh_ret_addr4: ; X64-NOPIC-NEXT: movq %rsp, %rax ; X64-NOPIC-NEXT: sarq $63, %rax -; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr4, %rbp +; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr4, %r12 ; X64-NOPIC-NEXT: cmovneq %r15, %rax -; X64-NOPIC-NEXT: movl (%rbx), %ebp -; X64-NOPIC-NEXT: movl $42, %r12d +; X64-NOPIC-NEXT: movl (%rbx), %r12d +; X64-NOPIC-NEXT: movl $42, %ebp ; X64-NOPIC-NEXT: shlq $47, %rax ; X64-NOPIC-NEXT: movq %r14, %rdi -; X64-NOPIC-NEXT: movl %r12d, %esi +; X64-NOPIC-NEXT: movl %ebp, %esi ; X64-NOPIC-NEXT: orq %rax, %rsp ; X64-NOPIC-NEXT: movq $.Lslh_ret_addr5, %r13 ; X64-NOPIC-NEXT: callq sigsetjmp@PLT @@ -315,11 +315,11 @@ ; X64-NOPIC-NEXT: sarq $63, %rax ; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr5, %r13 ; X64-NOPIC-NEXT: cmovneq %r15, %rax -; X64-NOPIC-NEXT: addl (%rbx), %ebp +; X64-NOPIC-NEXT: addl (%rbx), %r12d ; X64-NOPIC-NEXT: shlq $47, %rax ; X64-NOPIC-NEXT: movq %r14, %rdi ; X64-NOPIC-NEXT: movq %r14, %rsi -; X64-NOPIC-NEXT: movl %r12d, %edx +; X64-NOPIC-NEXT: movl %ebp, %edx ; X64-NOPIC-NEXT: orq %rax, %rsp ; X64-NOPIC-NEXT: movq $.Lslh_ret_addr6, %r14 ; X64-NOPIC-NEXT: callq __sigsetjmp@PLT @@ -329,8 +329,8 @@ ; X64-NOPIC-NEXT: cmpq $.Lslh_ret_addr6, %r14 ; X64-NOPIC-NEXT: movq %rax, %rcx ; X64-NOPIC-NEXT: cmovneq %r15, %rcx -; X64-NOPIC-NEXT: addl (%rbx), %ebp -; X64-NOPIC-NEXT: movl %ebp, %eax +; X64-NOPIC-NEXT: addl (%rbx), %r12d +; X64-NOPIC-NEXT: movl %r12d, %eax ; X64-NOPIC-NEXT: orl %ecx, %eax ; X64-NOPIC-NEXT: shlq $47, %rcx ; X64-NOPIC-NEXT: orq %rcx, %rsp @@ -360,19 +360,19 @@ ; X64-NOPIC-MCM-NEXT: shlq $47, %rax ; X64-NOPIC-MCM-NEXT: movq %r14, %rdi ; X64-NOPIC-MCM-NEXT: orq %rax, %rsp -; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr4(%rip), %rbp +; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr4(%rip), %r12 ; X64-NOPIC-MCM-NEXT: callq setjmp@PLT ; X64-NOPIC-MCM-NEXT: .Lslh_ret_addr4: ; X64-NOPIC-MCM-NEXT: movq %rsp, %rax ; X64-NOPIC-MCM-NEXT: sarq $63, %rax ; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr4(%rip), %rcx -; X64-NOPIC-MCM-NEXT: cmpq %rcx, %rbp +; X64-NOPIC-MCM-NEXT: cmpq %rcx, %r12 ; X64-NOPIC-MCM-NEXT: cmovneq %r15, %rax -; X64-NOPIC-MCM-NEXT: movl (%rbx), %ebp -; X64-NOPIC-MCM-NEXT: movl $42, %r12d +; X64-NOPIC-MCM-NEXT: movl (%rbx), %r12d +; X64-NOPIC-MCM-NEXT: movl $42, %ebp ; X64-NOPIC-MCM-NEXT: shlq $47, %rax ; X64-NOPIC-MCM-NEXT: movq %r14, %rdi -; X64-NOPIC-MCM-NEXT: movl %r12d, %esi +; X64-NOPIC-MCM-NEXT: movl %ebp, %esi ; X64-NOPIC-MCM-NEXT: orq %rax, %rsp ; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr5(%rip), %r13 ; X64-NOPIC-MCM-NEXT: callq sigsetjmp@PLT @@ -382,11 +382,11 @@ ; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr5(%rip), %rcx ; X64-NOPIC-MCM-NEXT: cmpq %rcx, %r13 ; X64-NOPIC-MCM-NEXT: cmovneq %r15, %rax -; X64-NOPIC-MCM-NEXT: addl (%rbx), %ebp +; X64-NOPIC-MCM-NEXT: addl (%rbx), %r12d ; X64-NOPIC-MCM-NEXT: shlq $47, %rax ; X64-NOPIC-MCM-NEXT: movq %r14, %rdi ; X64-NOPIC-MCM-NEXT: movq %r14, %rsi -; X64-NOPIC-MCM-NEXT: movl %r12d, %edx +; X64-NOPIC-MCM-NEXT: movl %ebp, %edx ; X64-NOPIC-MCM-NEXT: orq %rax, %rsp ; X64-NOPIC-MCM-NEXT: leaq .Lslh_ret_addr6(%rip), %r14 ; X64-NOPIC-MCM-NEXT: callq __sigsetjmp@PLT @@ -397,8 +397,8 @@ ; X64-NOPIC-MCM-NEXT: cmpq %rcx, %r14 ; 
X64-NOPIC-MCM-NEXT: movq %rax, %rcx ; X64-NOPIC-MCM-NEXT: cmovneq %r15, %rcx -; X64-NOPIC-MCM-NEXT: addl (%rbx), %ebp -; X64-NOPIC-MCM-NEXT: movl %ebp, %eax +; X64-NOPIC-MCM-NEXT: addl (%rbx), %r12d +; X64-NOPIC-MCM-NEXT: movl %r12d, %eax ; X64-NOPIC-MCM-NEXT: orl %ecx, %eax ; X64-NOPIC-MCM-NEXT: shlq $47, %rcx ; X64-NOPIC-MCM-NEXT: orq %rcx, %rsp @@ -428,19 +428,19 @@ ; X64-PIC-NEXT: shlq $47, %rax ; X64-PIC-NEXT: movq %r14, %rdi ; X64-PIC-NEXT: orq %rax, %rsp -; X64-PIC-NEXT: leaq .Lslh_ret_addr4(%rip), %rbp +; X64-PIC-NEXT: leaq .Lslh_ret_addr4(%rip), %r12 ; X64-PIC-NEXT: callq setjmp@PLT ; X64-PIC-NEXT: .Lslh_ret_addr4: ; X64-PIC-NEXT: movq %rsp, %rax ; X64-PIC-NEXT: sarq $63, %rax ; X64-PIC-NEXT: leaq .Lslh_ret_addr4(%rip), %rcx -; X64-PIC-NEXT: cmpq %rcx, %rbp +; X64-PIC-NEXT: cmpq %rcx, %r12 ; X64-PIC-NEXT: cmovneq %r15, %rax -; X64-PIC-NEXT: movl (%rbx), %ebp -; X64-PIC-NEXT: movl $42, %r12d +; X64-PIC-NEXT: movl (%rbx), %r12d +; X64-PIC-NEXT: movl $42, %ebp ; X64-PIC-NEXT: shlq $47, %rax ; X64-PIC-NEXT: movq %r14, %rdi -; X64-PIC-NEXT: movl %r12d, %esi +; X64-PIC-NEXT: movl %ebp, %esi ; X64-PIC-NEXT: orq %rax, %rsp ; X64-PIC-NEXT: leaq .Lslh_ret_addr5(%rip), %r13 ; X64-PIC-NEXT: callq sigsetjmp@PLT @@ -450,11 +450,11 @@ ; X64-PIC-NEXT: leaq .Lslh_ret_addr5(%rip), %rcx ; X64-PIC-NEXT: cmpq %rcx, %r13 ; X64-PIC-NEXT: cmovneq %r15, %rax -; X64-PIC-NEXT: addl (%rbx), %ebp +; X64-PIC-NEXT: addl (%rbx), %r12d ; X64-PIC-NEXT: shlq $47, %rax ; X64-PIC-NEXT: movq %r14, %rdi ; X64-PIC-NEXT: movq %r14, %rsi -; X64-PIC-NEXT: movl %r12d, %edx +; X64-PIC-NEXT: movl %ebp, %edx ; X64-PIC-NEXT: orq %rax, %rsp ; X64-PIC-NEXT: leaq .Lslh_ret_addr6(%rip), %r14 ; X64-PIC-NEXT: callq __sigsetjmp@PLT @@ -465,8 +465,8 @@ ; X64-PIC-NEXT: cmpq %rcx, %r14 ; X64-PIC-NEXT: movq %rax, %rcx ; X64-PIC-NEXT: cmovneq %r15, %rcx -; X64-PIC-NEXT: addl (%rbx), %ebp -; X64-PIC-NEXT: movl %ebp, %eax +; X64-PIC-NEXT: addl (%rbx), %r12d +; X64-PIC-NEXT: movl %r12d, %eax ; X64-PIC-NEXT: orl %ecx, %eax ; X64-PIC-NEXT: shlq $47, %rcx ; X64-PIC-NEXT: orq %rcx, %rsp diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening.ll b/llvm/test/CodeGen/X86/speculative-load-hardening.ll --- a/llvm/test/CodeGen/X86/speculative-load-hardening.ll +++ b/llvm/test/CodeGen/X86/speculative-load-hardening.ll @@ -42,16 +42,16 @@ ; X64-NEXT: .cfi_offset %r14, -24 ; X64-NEXT: .cfi_offset %r15, -16 ; X64-NEXT: movq %rsp, %rax -; X64-NEXT: movq $-1, %rbx +; X64-NEXT: movq $-1, %r14 ; X64-NEXT: sarq $63, %rax ; X64-NEXT: testl %edi, %edi ; X64-NEXT: jne .LBB1_1 ; X64-NEXT: # %bb.2: # %then1 -; X64-NEXT: cmovneq %rbx, %rax +; X64-NEXT: cmovneq %r14, %rax ; X64-NEXT: testl %esi, %esi ; X64-NEXT: je .LBB1_4 ; X64-NEXT: .LBB1_1: -; X64-NEXT: cmoveq %rbx, %rax +; X64-NEXT: cmoveq %r14, %rax ; X64-NEXT: .LBB1_8: # %exit ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp @@ -64,24 +64,24 @@ ; X64-NEXT: retq ; X64-NEXT: .LBB1_4: # %then2 ; X64-NEXT: .cfi_def_cfa_offset 32 -; X64-NEXT: movq %r8, %r14 -; X64-NEXT: cmovneq %rbx, %rax +; X64-NEXT: movq %r8, %rbx +; X64-NEXT: cmovneq %r14, %rax ; X64-NEXT: testl %edx, %edx ; X64-NEXT: je .LBB1_6 ; X64-NEXT: # %bb.5: # %else3 -; X64-NEXT: cmoveq %rbx, %rax +; X64-NEXT: cmoveq %r14, %rax ; X64-NEXT: movslq (%r9), %rcx ; X64-NEXT: orq %rax, %rcx -; X64-NEXT: leaq (%r14,%rcx,4), %r15 -; X64-NEXT: movl %ecx, (%r14,%rcx,4) +; X64-NEXT: leaq (%rbx,%rcx,4), %r15 +; X64-NEXT: movl %ecx, (%rbx,%rcx,4) ; X64-NEXT: jmp .LBB1_7 ; X64-NEXT: .LBB1_6: # %then3 -; X64-NEXT: cmovneq %rbx, %rax +; X64-NEXT: cmovneq %r14, 
%rax ; X64-NEXT: movl (%rcx), %ecx -; X64-NEXT: addl (%r14), %ecx +; X64-NEXT: addl (%rbx), %ecx ; X64-NEXT: movslq %ecx, %rdi ; X64-NEXT: orq %rax, %rdi -; X64-NEXT: movl (%r14,%rdi,4), %esi +; X64-NEXT: movl (%rbx,%rdi,4), %esi ; X64-NEXT: orl %eax, %esi ; X64-NEXT: movq (%r9), %r15 ; X64-NEXT: orq %rax, %r15 @@ -95,11 +95,11 @@ ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx ; X64-NEXT: sarq $63, %rax ; X64-NEXT: cmpq $.Lslh_ret_addr0, %rcx -; X64-NEXT: cmovneq %rbx, %rax +; X64-NEXT: cmovneq %r14, %rax ; X64-NEXT: .LBB1_7: # %merge ; X64-NEXT: movslq (%r15), %rcx ; X64-NEXT: orq %rax, %rcx -; X64-NEXT: movl $0, (%r14,%rcx,4) +; X64-NEXT: movl $0, (%rbx,%rcx,4) ; X64-NEXT: jmp .LBB1_8 ; ; X64-LFENCE-LABEL: test_basic_conditions: @@ -210,18 +210,18 @@ ; X64-NEXT: cmoveq %r15, %rax ; X64-NEXT: jmp .LBB2_5 ; X64-NEXT: .LBB2_2: # %l.header.preheader -; X64-NEXT: movq %rcx, %r14 -; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movl %esi, %ebp ; X64-NEXT: cmovneq %r15, %rax -; X64-NEXT: xorl %ebx, %ebx +; X64-NEXT: xorl %r12d, %r12d ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB2_3: # %l.header ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movslq (%r12), %rcx +; X64-NEXT: movslq (%r14), %rcx ; X64-NEXT: orq %rax, %rcx ; X64-NEXT: movq %rax, %rdx -; X64-NEXT: orq %r14, %rdx +; X64-NEXT: orq %rbx, %rdx ; X64-NEXT: movl (%rdx,%rcx,4), %edi ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp @@ -232,8 +232,8 @@ ; X64-NEXT: sarq $63, %rax ; X64-NEXT: cmpq $.Lslh_ret_addr1, %rcx ; X64-NEXT: cmovneq %r15, %rax -; X64-NEXT: incl %ebx -; X64-NEXT: cmpl %ebp, %ebx +; X64-NEXT: incl %r12d +; X64-NEXT: cmpl %ebp, %r12d ; X64-NEXT: jge .LBB2_4 ; X64-NEXT: # %bb.6: # in Loop: Header=BB2_3 Depth=1 ; X64-NEXT: cmovgeq %r15, %rax @@ -260,20 +260,20 @@ ; X64-LFENCE-NEXT: testl %edi, %edi ; X64-LFENCE-NEXT: jne .LBB2_3 ; X64-LFENCE-NEXT: # %bb.1: # %l.header.preheader -; X64-LFENCE-NEXT: movq %rcx, %r14 -; X64-LFENCE-NEXT: movq %rdx, %r15 +; X64-LFENCE-NEXT: movq %rcx, %rbx +; X64-LFENCE-NEXT: movq %rdx, %r14 ; X64-LFENCE-NEXT: movl %esi, %ebp ; X64-LFENCE-NEXT: lfence -; X64-LFENCE-NEXT: xorl %ebx, %ebx +; X64-LFENCE-NEXT: xorl %r15d, %r15d ; X64-LFENCE-NEXT: .p2align 4, 0x90 ; X64-LFENCE-NEXT: .LBB2_2: # %l.header ; X64-LFENCE-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-LFENCE-NEXT: lfence -; X64-LFENCE-NEXT: movslq (%r15), %rax -; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi +; X64-LFENCE-NEXT: movslq (%r14), %rax +; X64-LFENCE-NEXT: movl (%rbx,%rax,4), %edi ; X64-LFENCE-NEXT: callq sink@PLT -; X64-LFENCE-NEXT: incl %ebx -; X64-LFENCE-NEXT: cmpl %ebp, %ebx +; X64-LFENCE-NEXT: incl %r15d +; X64-LFENCE-NEXT: cmpl %ebp, %r15d ; X64-LFENCE-NEXT: jl .LBB2_2 ; X64-LFENCE-NEXT: .LBB2_3: # %exit ; X64-LFENCE-NEXT: lfence @@ -312,34 +312,34 @@ ; X64-NEXT: pushq %rbx ; X64-NEXT: pushq %rax ; X64-NEXT: movq %rsp, %rax -; X64-NEXT: movq $-1, %rbp +; X64-NEXT: movq $-1, %r12 ; X64-NEXT: sarq $63, %rax ; X64-NEXT: testl %edi, %edi ; X64-NEXT: je .LBB3_2 ; X64-NEXT: # %bb.1: -; X64-NEXT: cmoveq %rbp, %rax +; X64-NEXT: cmoveq %r12, %rax ; X64-NEXT: jmp .LBB3_10 ; X64-NEXT: .LBB3_2: # %l1.header.preheader -; X64-NEXT: movq %r8, %r14 -; X64-NEXT: movq %rcx, %rbx -; X64-NEXT: movl %edx, %r12d +; X64-NEXT: movq %r8, %rbx +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movl %edx, %ebp ; X64-NEXT: movl %esi, %r15d -; X64-NEXT: cmovneq %rbp, %rax +; X64-NEXT: cmovneq %r12, %rax ; X64-NEXT: xorl %r13d, %r13d ; X64-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; 
X64-NEXT: testl %r15d, %r15d ; X64-NEXT: jle .LBB3_4 ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB3_5: # %l2.header.preheader -; X64-NEXT: cmovleq %rbp, %rax +; X64-NEXT: cmovleq %r12, %rax ; X64-NEXT: xorl %r15d, %r15d ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB3_6: # %l2.header ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movslq (%rbx), %rcx +; X64-NEXT: movslq (%r14), %rcx ; X64-NEXT: orq %rax, %rcx ; X64-NEXT: movq %rax, %rdx -; X64-NEXT: orq %r14, %rdx +; X64-NEXT: orq %rbx, %rdx ; X64-NEXT: movl (%rdx,%rcx,4), %edi ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp @@ -349,26 +349,26 @@ ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx ; X64-NEXT: sarq $63, %rax ; X64-NEXT: cmpq $.Lslh_ret_addr2, %rcx -; X64-NEXT: cmovneq %rbp, %rax +; X64-NEXT: cmovneq %r12, %rax ; X64-NEXT: incl %r15d -; X64-NEXT: cmpl %r12d, %r15d +; X64-NEXT: cmpl %ebp, %r15d ; X64-NEXT: jge .LBB3_7 ; X64-NEXT: # %bb.11: # in Loop: Header=BB3_6 Depth=1 -; X64-NEXT: cmovgeq %rbp, %rax +; X64-NEXT: cmovgeq %r12, %rax ; X64-NEXT: jmp .LBB3_6 ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB3_7: -; X64-NEXT: cmovlq %rbp, %rax +; X64-NEXT: cmovlq %r12, %rax ; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Reload ; X64-NEXT: jmp .LBB3_8 ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB3_4: -; X64-NEXT: cmovgq %rbp, %rax +; X64-NEXT: cmovgq %r12, %rax ; X64-NEXT: .LBB3_8: # %l1.latch -; X64-NEXT: movslq (%rbx), %rcx +; X64-NEXT: movslq (%r14), %rcx ; X64-NEXT: orq %rax, %rcx ; X64-NEXT: movq %rax, %rdx -; X64-NEXT: orq %r14, %rdx +; X64-NEXT: orq %rbx, %rdx ; X64-NEXT: movl (%rdx,%rcx,4), %edi ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp @@ -378,17 +378,17 @@ ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx ; X64-NEXT: sarq $63, %rax ; X64-NEXT: cmpq $.Lslh_ret_addr3, %rcx -; X64-NEXT: cmovneq %rbp, %rax +; X64-NEXT: cmovneq %r12, %rax ; X64-NEXT: incl %r13d ; X64-NEXT: cmpl %r15d, %r13d ; X64-NEXT: jge .LBB3_9 ; X64-NEXT: # %bb.12: -; X64-NEXT: cmovgeq %rbp, %rax +; X64-NEXT: cmovgeq %r12, %rax ; X64-NEXT: testl %r15d, %r15d ; X64-NEXT: jg .LBB3_5 ; X64-NEXT: jmp .LBB3_4 ; X64-NEXT: .LBB3_9: -; X64-NEXT: cmovlq %rbp, %rax +; X64-NEXT: cmovlq %r12, %rax ; X64-NEXT: .LBB3_10: # %exit ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp @@ -423,9 +423,9 @@ ; X64-LFENCE-NEXT: popq %rbp ; X64-LFENCE-NEXT: retq ; X64-LFENCE-NEXT: .LBB3_1: # %l1.header.preheader -; X64-LFENCE-NEXT: movq %r8, %r14 -; X64-LFENCE-NEXT: movq %rcx, %rbx -; X64-LFENCE-NEXT: movl %edx, %r13d +; X64-LFENCE-NEXT: movq %r8, %rbx +; X64-LFENCE-NEXT: movq %rcx, %r14 +; X64-LFENCE-NEXT: movl %edx, %ebp ; X64-LFENCE-NEXT: movl %esi, %r15d ; X64-LFENCE-NEXT: lfence ; X64-LFENCE-NEXT: xorl %r12d, %r12d @@ -434,8 +434,8 @@ ; X64-LFENCE-NEXT: .LBB3_5: # %l1.latch ; X64-LFENCE-NEXT: # in Loop: Header=BB3_2 Depth=1 ; X64-LFENCE-NEXT: lfence -; X64-LFENCE-NEXT: movslq (%rbx), %rax -; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi +; X64-LFENCE-NEXT: movslq (%r14), %rax +; X64-LFENCE-NEXT: movl (%rbx,%rax,4), %edi ; X64-LFENCE-NEXT: callq sink@PLT ; X64-LFENCE-NEXT: incl %r12d ; X64-LFENCE-NEXT: cmpl %r15d, %r12d @@ -449,17 +449,17 @@ ; X64-LFENCE-NEXT: # %bb.3: # %l2.header.preheader ; X64-LFENCE-NEXT: # in Loop: Header=BB3_2 Depth=1 ; X64-LFENCE-NEXT: lfence -; X64-LFENCE-NEXT: xorl %ebp, %ebp +; X64-LFENCE-NEXT: xorl %r13d, %r13d ; X64-LFENCE-NEXT: .p2align 4, 0x90 ; X64-LFENCE-NEXT: .LBB3_4: # %l2.header ; X64-LFENCE-NEXT: # Parent Loop BB3_2 Depth=1 ; X64-LFENCE-NEXT: # => This Inner Loop Header: Depth=2 ; X64-LFENCE-NEXT: lfence -; X64-LFENCE-NEXT: 
movslq (%rbx), %rax -; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi +; X64-LFENCE-NEXT: movslq (%r14), %rax +; X64-LFENCE-NEXT: movl (%rbx,%rax,4), %edi ; X64-LFENCE-NEXT: callq sink@PLT -; X64-LFENCE-NEXT: incl %ebp -; X64-LFENCE-NEXT: cmpl %r13d, %ebp +; X64-LFENCE-NEXT: incl %r13d +; X64-LFENCE-NEXT: cmpl %ebp, %r13d ; X64-LFENCE-NEXT: jl .LBB3_4 ; X64-LFENCE-NEXT: jmp .LBB3_5 entry: @@ -542,13 +542,13 @@ ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx ; X64-NEXT: movq %rsp, %rax -; X64-NEXT: movq %rcx, %r15 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rsi, %r12 +; X64-NEXT: movq %rdi, %r15 ; X64-NEXT: movq $-1, %r13 ; X64-NEXT: sarq $63, %rax -; X64-NEXT: orq %rax, %r12 +; X64-NEXT: orq %rax, %r15 ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp @@ -559,7 +559,7 @@ ; X64-NEXT: sarq $63, %rax ; X64-NEXT: cmpq $.Lslh_ret_addr7, %rcx ; X64-NEXT: cmovneq %r13, %rax -; X64-NEXT: orq %rax, %rbx +; X64-NEXT: orq %rax, %r12 ; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp @@ -592,9 +592,9 @@ ; X64-NEXT: sarq $63, %rax ; X64-NEXT: cmpq $.Lslh_ret_addr10, %rcx ; X64-NEXT: cmovneq %r13, %rax -; X64-NEXT: orq %rax, %r14 +; X64-NEXT: orq %rax, %rbx ; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: cvtsi2ssl (%r14), %xmm0 +; X64-NEXT: cvtsi2ssl (%rbx), %xmm0 ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: callq sink_float@PLT @@ -604,9 +604,9 @@ ; X64-NEXT: sarq $63, %rax ; X64-NEXT: cmpq $.Lslh_ret_addr11, %rcx ; X64-NEXT: cmovneq %r13, %rax -; X64-NEXT: orq %rax, %r15 +; X64-NEXT: orq %rax, %r14 ; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: cvtsi2sdq (%r15), %xmm0 +; X64-NEXT: cvtsi2sdq (%r14), %xmm0 ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: callq sink_double@PLT @@ -617,7 +617,7 @@ ; X64-NEXT: cmpq $.Lslh_ret_addr12, %rcx ; X64-NEXT: cmovneq %r13, %rax ; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: cvtsi2ssq (%r15), %xmm0 +; X64-NEXT: cvtsi2ssq (%r14), %xmm0 ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: callq sink_float@PLT @@ -628,7 +628,7 @@ ; X64-NEXT: cmpq $.Lslh_ret_addr13, %rcx ; X64-NEXT: cmovneq %r13, %rax ; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: cvtsi2sdl (%r14), %xmm0 +; X64-NEXT: cvtsi2sdl (%rbx), %xmm0 ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: callq sink_double@PLT @@ -654,9 +654,9 @@ ; X64-LFENCE-NEXT: pushq %r12 ; X64-LFENCE-NEXT: pushq %rbx ; X64-LFENCE-NEXT: pushq %rax -; X64-LFENCE-NEXT: movq %rcx, %r15 -; X64-LFENCE-NEXT: movq %rdx, %r14 -; X64-LFENCE-NEXT: movq %rsi, %rbx +; X64-LFENCE-NEXT: movq %rcx, %r14 +; X64-LFENCE-NEXT: movq %rdx, %rbx +; X64-LFENCE-NEXT: movq %rsi, %r15 ; X64-LFENCE-NEXT: movq %rdi, %r12 ; X64-LFENCE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-LFENCE-NEXT: callq sink_float@PLT @@ -669,16 +669,16 @@ ; X64-LFENCE-NEXT: cvtss2sd %xmm0, %xmm0 ; X64-LFENCE-NEXT: callq sink_double@PLT ; X64-LFENCE-NEXT: xorps %xmm0, %xmm0 -; X64-LFENCE-NEXT: cvtsi2ssl (%r14), %xmm0 +; X64-LFENCE-NEXT: cvtsi2ssl (%rbx), %xmm0 ; X64-LFENCE-NEXT: callq sink_float@PLT ; X64-LFENCE-NEXT: xorps %xmm0, %xmm0 -; X64-LFENCE-NEXT: cvtsi2sdq (%r15), %xmm0 +; X64-LFENCE-NEXT: cvtsi2sdq (%r14), %xmm0 ; X64-LFENCE-NEXT: callq sink_double@PLT ; X64-LFENCE-NEXT: xorps %xmm0, %xmm0 -; X64-LFENCE-NEXT: cvtsi2ssq (%r15), %xmm0 +; X64-LFENCE-NEXT: cvtsi2ssq (%r14), %xmm0 ; 
X64-LFENCE-NEXT: callq sink_float@PLT ; X64-LFENCE-NEXT: xorps %xmm0, %xmm0 -; X64-LFENCE-NEXT: cvtsi2sdl (%r14), %xmm0 +; X64-LFENCE-NEXT: cvtsi2sdl (%rbx), %xmm0 ; X64-LFENCE-NEXT: callq sink_double@PLT ; X64-LFENCE-NEXT: addq $8, %rsp ; X64-LFENCE-NEXT: popq %rbx @@ -731,11 +731,11 @@ ; X64-NEXT: pushq %rbx ; X64-NEXT: pushq %rax ; X64-NEXT: movq %rsp, %rax -; X64-NEXT: movq %r9, %r14 -; X64-NEXT: movq %r8, %r15 -; X64-NEXT: movq %rcx, %r12 -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %r9, %rbx +; X64-NEXT: movq %r8, %r14 +; X64-NEXT: movq %rcx, %r15 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rsi, %r13 ; X64-NEXT: movq $-1, %rbp ; X64-NEXT: sarq $63, %rax ; X64-NEXT: orq %rax, %rdi @@ -749,8 +749,8 @@ ; X64-NEXT: sarq $63, %rax ; X64-NEXT: cmpq $.Lslh_ret_addr15, %rcx ; X64-NEXT: cmovneq %rbp, %rax -; X64-NEXT: orq %rax, %rbx -; X64-NEXT: movaps (%rbx), %xmm0 +; X64-NEXT: orq %rax, %r13 +; X64-NEXT: movaps (%r13), %xmm0 ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: callq sink_v2f64@PLT @@ -760,8 +760,8 @@ ; X64-NEXT: sarq $63, %rax ; X64-NEXT: cmpq $.Lslh_ret_addr16, %rcx ; X64-NEXT: cmovneq %rbp, %rax -; X64-NEXT: orq %rax, %r13 -; X64-NEXT: movaps (%r13), %xmm0 +; X64-NEXT: orq %rax, %r12 +; X64-NEXT: movaps (%r12), %xmm0 ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: callq sink_v16i8@PLT @@ -771,8 +771,8 @@ ; X64-NEXT: sarq $63, %rax ; X64-NEXT: cmpq $.Lslh_ret_addr17, %rcx ; X64-NEXT: cmovneq %rbp, %rax -; X64-NEXT: orq %rax, %r12 -; X64-NEXT: movaps (%r12), %xmm0 +; X64-NEXT: orq %rax, %r15 +; X64-NEXT: movaps (%r15), %xmm0 ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: callq sink_v8i16@PLT @@ -782,8 +782,8 @@ ; X64-NEXT: sarq $63, %rax ; X64-NEXT: cmpq $.Lslh_ret_addr18, %rcx ; X64-NEXT: cmovneq %rbp, %rax -; X64-NEXT: orq %rax, %r15 -; X64-NEXT: movaps (%r15), %xmm0 +; X64-NEXT: orq %rax, %r14 +; X64-NEXT: movaps (%r14), %xmm0 ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: callq sink_v4i32@PLT @@ -793,8 +793,8 @@ ; X64-NEXT: sarq $63, %rax ; X64-NEXT: cmpq $.Lslh_ret_addr19, %rcx ; X64-NEXT: cmovneq %rbp, %rax -; X64-NEXT: orq %rax, %r14 -; X64-NEXT: movaps (%r14), %xmm0 +; X64-NEXT: orq %rax, %rbx +; X64-NEXT: movaps (%rbx), %xmm0 ; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: callq sink_v2i64@PLT @@ -822,22 +822,22 @@ ; X64-LFENCE-NEXT: pushq %r13 ; X64-LFENCE-NEXT: pushq %r12 ; X64-LFENCE-NEXT: pushq %rbx -; X64-LFENCE-NEXT: movq %r9, %r14 -; X64-LFENCE-NEXT: movq %r8, %r15 -; X64-LFENCE-NEXT: movq %rcx, %r12 -; X64-LFENCE-NEXT: movq %rdx, %r13 -; X64-LFENCE-NEXT: movq %rsi, %rbx +; X64-LFENCE-NEXT: movq %r9, %rbx +; X64-LFENCE-NEXT: movq %r8, %r14 +; X64-LFENCE-NEXT: movq %rcx, %r15 +; X64-LFENCE-NEXT: movq %rdx, %r12 +; X64-LFENCE-NEXT: movq %rsi, %r13 ; X64-LFENCE-NEXT: movaps (%rdi), %xmm0 ; X64-LFENCE-NEXT: callq sink_v4f32@PLT -; X64-LFENCE-NEXT: movaps (%rbx), %xmm0 -; X64-LFENCE-NEXT: callq sink_v2f64@PLT ; X64-LFENCE-NEXT: movaps (%r13), %xmm0 -; X64-LFENCE-NEXT: callq sink_v16i8@PLT +; X64-LFENCE-NEXT: callq sink_v2f64@PLT ; X64-LFENCE-NEXT: movaps (%r12), %xmm0 -; X64-LFENCE-NEXT: callq sink_v8i16@PLT +; X64-LFENCE-NEXT: callq sink_v16i8@PLT ; X64-LFENCE-NEXT: movaps (%r15), %xmm0 -; X64-LFENCE-NEXT: callq sink_v4i32@PLT +; X64-LFENCE-NEXT: callq sink_v8i16@PLT ; X64-LFENCE-NEXT: movaps (%r14), %xmm0 +; X64-LFENCE-NEXT: callq sink_v4i32@PLT +; X64-LFENCE-NEXT: movaps (%rbx), %xmm0 ; X64-LFENCE-NEXT: callq sink_v2i64@PLT ; 
X64-LFENCE-NEXT: popq %rbx ; X64-LFENCE-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2360,8 +2360,8 @@ ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 ; CHECK-AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4 ; CHECK-AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; CHECK-AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 ; CHECK-AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm4 ; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm6 @@ -2383,22 +2383,22 @@ ; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm0[9],zero,zero,zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,zero,zero,xmm0[15],zero -; CHECK-AVX1-NEXT: vpackuswb %xmm5, %xmm7, %xmm5 -; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 -; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-AVX1-NEXT: vpsraw $8, %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-AVX1-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm0[9],zero,zero,zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,zero,zero,xmm0[15],zero +; CHECK-AVX1-NEXT: vpackuswb %xmm8, %xmm7, %xmm7 +; CHECK-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-AVX1-NEXT: vpsraw $8, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 -; CHECK-AVX1-NEXT: vpsrlw $7, %xmm5, %xmm5 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 -; CHECK-AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 -; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-AVX1-NEXT: vpsraw $8, %xmm8, %xmm8 +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8, %xmm8 +; CHECK-AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8 +; CHECK-AVX1-NEXT: vpackuswb %xmm7, %xmm8, %xmm7 +; CHECK-AVX1-NEXT: vpsrlw $7, %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5 +; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm7, %xmm5 ; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 diff --git a/llvm/test/CodeGen/X86/sse-intel-ocl.ll b/llvm/test/CodeGen/X86/sse-intel-ocl.ll --- a/llvm/test/CodeGen/X86/sse-intel-ocl.ll +++ b/llvm/test/CodeGen/X86/sse-intel-ocl.ll @@ -42,14 +42,14 @@ ; WIN64-NEXT: movaps (%r8), %xmm2 ; WIN64-NEXT: movaps (%rdx), %xmm1 ; WIN64-NEXT: movaps (%rcx), %xmm0 -; 
WIN64-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; WIN64-NEXT: addps (%rax), %xmm0 +; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; WIN64-NEXT: addps (%r8), %xmm0 ; WIN64-NEXT: addps (%rdx), %xmm1 ; WIN64-NEXT: addps (%rcx), %xmm2 -; WIN64-NEXT: addps (%r8), %xmm3 +; WIN64-NEXT: addps (%rax), %xmm3 ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: callq func_float16_ptr ; WIN64-NEXT: addps {{[0-9]+}}(%rsp), %xmm0 diff --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll --- a/llvm/test/CodeGen/X86/sse-regcall.ll +++ b/llvm/test/CodeGen/X86/sse-regcall.ll @@ -243,11 +243,8 @@ ; ; WIN64-LABEL: testi32_inp: ; WIN64: # %bb.0: -; WIN64-NEXT: pushq %r13 -; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx ; WIN64-NEXT: # kill: def $edx killed $edx def $rdx -; WIN64-NEXT: movl %ecx, %ebx ; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15 ; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14 @@ -257,45 +254,40 @@ ; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9 ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi -; WIN64-NEXT: leal (%rdx,%rdi), %r13d +; WIN64-NEXT: leal (%rdx,%rdi), %ebx ; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx ; WIN64-NEXT: subl %edi, %edx -; WIN64-NEXT: leal (%rsi,%r8), %ecx +; WIN64-NEXT: leal (%rsi,%r8), %edi ; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi ; WIN64-NEXT: subl %r8d, %esi ; WIN64-NEXT: leal (%r9,%r10), %r8d -; WIN64-NEXT: movl %r9d, %ebp -; WIN64-NEXT: subl %r10d, %ebp -; WIN64-NEXT: movl %eax, %edi -; WIN64-NEXT: movl %ebx, %r9d -; WIN64-NEXT: subl %ebx, %edi -; WIN64-NEXT: imull %edi, %ebp -; WIN64-NEXT: leal (%r11,%r12), %edi -; WIN64-NEXT: movl %r11d, %ebx -; WIN64-NEXT: subl %r12d, %ebx -; WIN64-NEXT: imull %edx, %ebx -; WIN64-NEXT: addl %ebp, %ebx +; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9 +; WIN64-NEXT: subl %r10d, %r9d +; WIN64-NEXT: movl %eax, %r10d +; WIN64-NEXT: subl %ecx, %r10d +; WIN64-NEXT: imull %r10d, %r9d +; WIN64-NEXT: leal (%r11,%r12), %r10d +; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11 +; WIN64-NEXT: subl %r12d, %r11d +; WIN64-NEXT: imull %edx, %r11d +; WIN64-NEXT: addl %r9d, %r11d ; WIN64-NEXT: leal (%r14,%r15), %edx -; WIN64-NEXT: movl %r14d, %ebp -; WIN64-NEXT: subl %r15d, %ebp -; WIN64-NEXT: imull %esi, %ebp -; WIN64-NEXT: addl %ebx, %ebp -; WIN64-NEXT: addl %r9d, %eax +; WIN64-NEXT: movl %r14d, %r9d +; WIN64-NEXT: subl %r15d, %r9d +; WIN64-NEXT: imull %esi, %r9d +; WIN64-NEXT: addl %r11d, %r9d +; WIN64-NEXT: addl %ecx, %eax ; WIN64-NEXT: imull %r8d, %eax -; WIN64-NEXT: imull %r13d, %edi -; WIN64-NEXT: addl %edi, %eax -; WIN64-NEXT: imull %ecx, %edx +; WIN64-NEXT: imull %ebx, %r10d +; WIN64-NEXT: addl %r10d, %eax +; WIN64-NEXT: imull %edi, %edx ; WIN64-NEXT: addl %edx, %eax -; WIN64-NEXT: addl %ebp, %eax +; WIN64-NEXT: addl %r9d, %eax ; WIN64-NEXT: popq %rbx -; WIN64-NEXT: popq %rbp -; WIN64-NEXT: popq %r13 ; WIN64-NEXT: retq ; ; LINUXOSX-LABEL: testi32_inp: ; LINUXOSX: # %bb.0: -; LINUXOSX-NEXT: pushq %rbp -; LINUXOSX-NEXT: pushq %rbx ; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx ; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX-NEXT: # kill: def $r14d killed $r14d def $r14 @@ -305,37 +297,35 @@ ; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8 ; LINUXOSX-NEXT: # kill: 
def $edi killed $edi def $rdi ; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d -; LINUXOSX-NEXT: movl %edx, %ebp -; LINUXOSX-NEXT: subl %edi, %ebp -; LINUXOSX-NEXT: leal (%rsi,%r8), %r11d +; LINUXOSX-NEXT: # kill: def $edx killed $edx killed $rdx +; LINUXOSX-NEXT: subl %edi, %edx +; LINUXOSX-NEXT: leal (%rsi,%r8), %edi ; LINUXOSX-NEXT: # kill: def $esi killed $esi killed $rsi ; LINUXOSX-NEXT: subl %r8d, %esi ; LINUXOSX-NEXT: leal (%r9,%r12), %r8d -; LINUXOSX-NEXT: movl %r9d, %edi -; LINUXOSX-NEXT: subl %r12d, %edi -; LINUXOSX-NEXT: movl %eax, %edx -; LINUXOSX-NEXT: subl %ecx, %edx -; LINUXOSX-NEXT: imull %edx, %edi -; LINUXOSX-NEXT: leal (%r13,%r14), %edx -; LINUXOSX-NEXT: movl %r13d, %ebx -; LINUXOSX-NEXT: subl %r14d, %ebx -; LINUXOSX-NEXT: imull %ebp, %ebx -; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %ebp -; LINUXOSX-NEXT: addl %edi, %ebx -; LINUXOSX-NEXT: movl %r15d, %edi -; LINUXOSX-NEXT: subl %ebp, %edi -; LINUXOSX-NEXT: imull %esi, %edi -; LINUXOSX-NEXT: addl %ebx, %edi +; LINUXOSX-NEXT: # kill: def $r9d killed $r9d killed $r9 +; LINUXOSX-NEXT: subl %r12d, %r9d +; LINUXOSX-NEXT: movl %eax, %r11d +; LINUXOSX-NEXT: subl %ecx, %r11d +; LINUXOSX-NEXT: imull %r11d, %r9d +; LINUXOSX-NEXT: leal (%r13,%r14), %r11d +; LINUXOSX-NEXT: movl %r13d, %r12d +; LINUXOSX-NEXT: subl %r14d, %r12d +; LINUXOSX-NEXT: imull %edx, %r12d +; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %edx +; LINUXOSX-NEXT: addl %r9d, %r12d +; LINUXOSX-NEXT: movl %r15d, %r9d +; LINUXOSX-NEXT: subl %edx, %r9d +; LINUXOSX-NEXT: imull %esi, %r9d +; LINUXOSX-NEXT: addl %r12d, %r9d ; LINUXOSX-NEXT: addl %ecx, %eax ; LINUXOSX-NEXT: imull %r8d, %eax -; LINUXOSX-NEXT: imull %r10d, %edx +; LINUXOSX-NEXT: imull %r10d, %r11d +; LINUXOSX-NEXT: addl %r11d, %eax +; LINUXOSX-NEXT: addl %r15d, %edx +; LINUXOSX-NEXT: imull %edi, %edx ; LINUXOSX-NEXT: addl %edx, %eax -; LINUXOSX-NEXT: addl %r15d, %ebp -; LINUXOSX-NEXT: imull %r11d, %ebp -; LINUXOSX-NEXT: addl %ebp, %eax -; LINUXOSX-NEXT: addl %edi, %eax -; LINUXOSX-NEXT: popq %rbx -; LINUXOSX-NEXT: popq %rbp +; LINUXOSX-NEXT: addl %r9d, %eax ; LINUXOSX-NEXT: retq i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind { %x1 = sub i32 %a1, %a2 diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -3533,10 +3533,10 @@ ; ; X64-AVX1-LABEL: test_mm_set_epi8: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x48] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x50] -; X64-AVX1-NEXT: vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0] -; X64-AVX1-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01] +; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x48] +; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x50] +; X64-AVX1-NEXT: vmovd %r10d, %xmm0 # encoding: [0xc4,0xc1,0x79,0x6e,0xc2] +; X64-AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40] ; X64-AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] @@ -3569,10 +3569,10 @@ ; ; X64-AVX512-LABEL: test_mm_set_epi8: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: movzbl 
{{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x48] -; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x50] -; X64-AVX512-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; X64-AVX512-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01] +; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x48] +; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x50] +; X64-AVX512-NEXT: vmovd %r10d, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc2] +; X64-AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40] ; X64-AVX512-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] @@ -3671,10 +3671,10 @@ ; ; X32-AVX1-LABEL: test_mm_set_epi8: ; X32-AVX1: # %bb.0: -; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x48] -; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50] -; X32-AVX1-NEXT: vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0] -; X32-AVX1-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01] +; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x48] +; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x50] +; X32-AVX1-NEXT: vmovd %r10d, %xmm0 # encoding: [0xc4,0xc1,0x79,0x6e,0xc2] +; X32-AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40] ; X32-AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38] @@ -3707,10 +3707,10 @@ ; ; X32-AVX512-LABEL: test_mm_set_epi8: ; X32-AVX512: # %bb.0: -; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x48] -; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50] -; X32-AVX512-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; X32-AVX512-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01] +; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x48] +; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x50] +; X32-AVX512-NEXT: vmovd %r10d, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc2] +; X32-AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40] ; X32-AVX512-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38] @@ -3837,8 +3837,8 @@ ; ; X64-SSE-LABEL: test_mm_set_epi16: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x10] -; X64-SSE-NEXT: movzwl 
{{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x08] +; X64-SSE-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] +; X64-SSE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x08] ; X64-SSE-NEXT: movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7] ; X64-SSE-NEXT: movd %esi, %xmm1 # encoding: [0x66,0x0f,0x6e,0xce] ; X64-SSE-NEXT: punpcklwd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x61,0xc8] @@ -3853,8 +3853,8 @@ ; X64-SSE-NEXT: movd %r9d, %xmm1 # encoding: [0x66,0x41,0x0f,0x6e,0xc9] ; X64-SSE-NEXT: punpcklwd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x61,0xc8] ; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X64-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] -; X64-SSE-NEXT: movd %r10d, %xmm0 # encoding: [0x66,0x41,0x0f,0x6e,0xc2] +; X64-SSE-NEXT: movd %r10d, %xmm3 # encoding: [0x66,0x41,0x0f,0x6e,0xda] +; X64-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X64-SSE-NEXT: punpcklwd %xmm3, %xmm0 # encoding: [0x66,0x0f,0x61,0xc3] ; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; X64-SSE-NEXT: punpckldq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x62,0xc1] @@ -3893,8 +3893,8 @@ ; ; X32-SSE-LABEL: test_mm_set_epi16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x10] -; X32-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x08] +; X32-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10] +; X32-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x08] ; X32-SSE-NEXT: movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7] ; X32-SSE-NEXT: movd %esi, %xmm1 # encoding: [0x66,0x0f,0x6e,0xce] ; X32-SSE-NEXT: punpcklwd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x61,0xc8] @@ -3909,8 +3909,8 @@ ; X32-SSE-NEXT: movd %r9d, %xmm1 # encoding: [0x66,0x41,0x0f,0x6e,0xc9] ; X32-SSE-NEXT: punpcklwd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x61,0xc8] ; X32-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] -; X32-SSE-NEXT: movd %r10d, %xmm0 # encoding: [0x66,0x41,0x0f,0x6e,0xc2] +; X32-SSE-NEXT: movd %r10d, %xmm3 # encoding: [0x66,0x41,0x0f,0x6e,0xda] +; X32-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X32-SSE-NEXT: punpcklwd %xmm3, %xmm0 # encoding: [0x66,0x0f,0x61,0xc3] ; X32-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; X32-SSE-NEXT: punpckldq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x62,0xc1] @@ -5282,30 +5282,30 @@ ; ; X64-AVX1-LABEL: test_mm_setr_epi16: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x10] -; X64-AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x08] +; X64-AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] +; X64-AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x08] ; X64-AVX1-NEXT: vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7] ; X64-AVX1-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc6,0x01] ; X64-AVX1-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc2,0x02] ; X64-AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x03] ; X64-AVX1-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04] ; X64-AVX1-NEXT: vpinsrw 
$5, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05] -; X64-AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; X64-AVX1-NEXT: vpinsrw $7, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x07] +; X64-AVX1-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x06] +; X64-AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_setr_epi16: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x10] -; X64-AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x08] +; X64-AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] +; X64-AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x08] ; X64-AVX512-NEXT: vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7] ; X64-AVX512-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc6,0x01] ; X64-AVX512-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc2,0x02] ; X64-AVX512-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x03] ; X64-AVX512-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04] ; X64-AVX512-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05] -; X64-AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; X64-AVX512-NEXT: vpinsrw $7, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x07] +; X64-AVX512-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x06] +; X64-AVX512-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] ; ; X32-SSE-LABEL: test_mm_setr_epi16: @@ -5338,30 +5338,30 @@ ; ; X32-AVX1-LABEL: test_mm_setr_epi16: ; X32-AVX1: # %bb.0: -; X32-AVX1-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x10] -; X32-AVX1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x08] +; X32-AVX1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10] +; X32-AVX1-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x08] ; X32-AVX1-NEXT: vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7] ; X32-AVX1-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc6,0x01] ; X32-AVX1-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc2,0x02] ; X32-AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x03] ; X32-AVX1-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04] ; X32-AVX1-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05] -; X32-AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; X32-AVX1-NEXT: vpinsrw $7, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x07] +; X32-AVX1-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x06] +; X32-AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X32-AVX1-NEXT: retq # encoding: [0xc3] ; ; X32-AVX512-LABEL: test_mm_setr_epi16: ; 
X32-AVX512: # %bb.0: -; X32-AVX512-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x10] -; X32-AVX512-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x08] +; X32-AVX512-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10] +; X32-AVX512-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x08] ; X32-AVX512-NEXT: vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7] ; X32-AVX512-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc6,0x01] ; X32-AVX512-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc2,0x02] ; X32-AVX512-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x03] ; X32-AVX512-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04] ; X32-AVX512-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05] -; X32-AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; X32-AVX512-NEXT: vpinsrw $7, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x07] +; X32-AVX512-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x06] +; X32-AVX512-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X32-AVX512-NEXT: retq # encoding: [0xc3] %res0 = insertelement <8 x i16> undef, i16 %a0, i32 0 %res1 = insertelement <8 x i16> %res0, i16 %a1, i32 1 diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll --- a/llvm/test/CodeGen/X86/sshl_sat.ll +++ b/llvm/test/CodeGen/X86/sshl_sat.ll @@ -162,16 +162,16 @@ ; X64-NEXT: shlb $4, %dil ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shlb %cl, %al -; X64-NEXT: movzbl %al, %esi -; X64-NEXT: movl %esi, %edx +; X64-NEXT: movzbl %al, %edx +; X64-NEXT: movl %edx, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarb %cl, %dl +; X64-NEXT: sarb %cl, %sil ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: testb %dil, %dil ; X64-NEXT: sets %al ; X64-NEXT: addl $127, %eax -; X64-NEXT: cmpb %dl, %dil -; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: cmpb %sil, %dil +; X64-NEXT: cmovel %edx, %eax ; X64-NEXT: sarb $4, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -357,16 +357,16 @@ ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shlb %cl, %al -; X64-NEXT: movzbl %al, %esi -; X64-NEXT: movl %esi, %edx +; X64-NEXT: movzbl %al, %edx +; X64-NEXT: movl %edx, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarb %cl, %dl +; X64-NEXT: sarb %cl, %sil ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: testb %dil, %dil ; X64-NEXT: sets %al ; X64-NEXT: addl $127, %eax -; X64-NEXT: cmpb %dl, %dil -; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: cmpb %sil, %dil +; X64-NEXT: cmovel %edx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -967,46 +967,46 @@ ; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pandn %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: pandn 
%xmm9, %xmm10 ; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: pxor %xmm10, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm9, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: por %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: psubd %xmm5, %xmm9 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 ; SSE2-NEXT: pxor %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm5 +; SSE2-NEXT: psrad $31, %xmm9 +; SSE2-NEXT: pxor %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psubd %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psubd %xmm6, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 ; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psubd %xmm7, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pandn %xmm5, %xmm6 +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: psubd %xmm7, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pandn %xmm5, %xmm6 +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: por %xmm6, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v16i32: @@ -1017,77 +1017,77 @@ ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 ; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pandn %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm10 +; SSSE3-NEXT: pandn %xmm9, %xmm10 ; SSSE3-NEXT: psrad $31, %xmm9 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: pxor %xmm10, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm4, %xmm9 ; SSSE3-NEXT: pand %xmm9, %xmm0 -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: psubd %xmm5, %xmm4 +; SSSE3-NEXT: por %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSSE3-NEXT: psubd %xmm5, %xmm9 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 ; SSSE3-NEXT: pxor %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm9, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm9 +; SSSE3-NEXT: pxor %xmm4, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm1 ; SSSE3-NEXT: por %xmm5, %xmm1 -; 
SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: psubd %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: psubd %xmm6, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 ; SSSE3-NEXT: pxor %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: psubd %xmm7, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: pandn %xmm5, %xmm6 +; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: psubd %xmm7, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 ; SSSE3-NEXT: pxor %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: por %xmm5, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: pandn %xmm5, %xmm6 +; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: por %xmm6, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v16i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm3, %xmm8 +; SSE41-NEXT: movdqa %xmm3, %xmm11 ; SSE41-NEXT: movdqa %xmm2, %xmm10 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm11, %xmm11 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: psubd %xmm4, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm9 +; SSE41-NEXT: pxor %xmm12, %xmm12 +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: psubd %xmm4, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm9, %xmm1 ; SSE41-NEXT: psubd %xmm5, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm9 +; SSE41-NEXT: pxor %xmm5, %xmm9 ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm10, %xmm2 ; SSE41-NEXT: psubd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm6 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm10 ; SSE41-NEXT: pxor %xmm6, %xmm10 ; SSE41-NEXT: movdqa %xmm2, %xmm3 @@ -1095,17 +1095,17 @@ ; SSE41-NEXT: pxor %xmm4, %xmm3 ; SSE41-NEXT: movdqa %xmm10, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: movdqa %xmm11, %xmm3 ; SSE41-NEXT: psubd %xmm7, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE41-NEXT: pxor %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE41-NEXT: pxor %xmm7, %xmm11 ; SSE41-NEXT: movdqa %xmm3, %xmm5 ; SSE41-NEXT: psrad 
$31, %xmm5 ; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3 -; SSE41-NEXT: movaps %xmm9, %xmm0 +; SSE41-NEXT: movaps %xmm8, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i32: @@ -1341,61 +1341,61 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE2-LABEL: v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 ; SSE2-NEXT: psubq %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 ; SSE2-NEXT: movdqa %xmm5, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm7, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm0, %xmm6 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: psubq %xmm3, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: psubq %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; 
SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm3 ; SSE2-NEXT: psrad $31, %xmm1 @@ -1407,61 +1407,61 @@ ; ; SSSE3-LABEL: v4i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 ; SSSE3-NEXT: psubq %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 ; SSSE3-NEXT: movdqa %xmm5, %xmm7 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pand %xmm7, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pandn %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pandn %xmm0, %xmm6 ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: psubq %xmm3, %xmm1 +; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pxor %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: psubq %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pxor %xmm8, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = 
xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: pxor %xmm6, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: pandn %xmm1, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -1474,25 +1474,25 @@ ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm6, %xmm0 ; SSE41-NEXT: psubq %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pxor %xmm9, %xmm5 +; SSE41-NEXT: pxor %xmm6, %xmm5 ; SSE41-NEXT: movdqa %xmm0, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm6 -; SSE41-NEXT: por %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm8 +; SSE41-NEXT: por %xmm0, %xmm8 +; SSE41-NEXT: pxor %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] ; SSE41-NEXT: pand %xmm0, %xmm5 ; SSE41-NEXT: por %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm5 +; SSE41-NEXT: pxor %xmm8, %xmm5 ; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: movapd %xmm7, %xmm2 @@ -1501,24 +1501,24 @@ ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 ; SSE41-NEXT: psubq %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: pxor %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm6 -; SSE41-NEXT: por %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm9, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm9 +; SSE41-NEXT: por %xmm0, %xmm9 +; SSE41-NEXT: pxor %xmm6, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm0, %xmm2 ; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -1611,62 +1611,62 @@ ; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] -; SSE2-NEXT: por %xmm12, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = 
xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm10, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 ; SSE2-NEXT: pandn %xmm0, %xmm10 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: por %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: psubq %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: psubq %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm10, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm11 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm9 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm10 +; SSE2-NEXT: por %xmm9, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: pand %xmm11, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm10, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm5 ; SSE2-NEXT: pandn %xmm1, %xmm5 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm9, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: psubq %xmm6, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: psubq %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] ; SSE2-NEXT: pand %xmm11, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm9 ; SSE2-NEXT: pxor %xmm8, %xmm6 ; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 @@ -1676,44 +1676,44 @@ ; SSE2-NEXT: pand %xmm10, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: pxor 
%xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: psubq %xmm7, %xmm3 +; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm4 +; SSE2-NEXT: psubq %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: pand %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm3, %xmm6 ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: por %xmm6, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i64: @@ -1737,62 +1737,62 @@ ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 ; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm11, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] -; SSSE3-NEXT: por %xmm12, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm9 +; SSSE3-NEXT: pxor %xmm10, %xmm9 +; SSSE3-NEXT: movdqa %xmm9, %xmm10 ; SSSE3-NEXT: pandn %xmm0, %xmm10 ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pxor %xmm9, %xmm0 -; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: pand %xmm9, %xmm0 ; SSSE3-NEXT: por %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSSE3-NEXT: pxor %xmm8, %xmm9 +; SSSE3-NEXT: psubq %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm10 ; SSSE3-NEXT: pxor %xmm8, %xmm10 -; SSSE3-NEXT: psubq %xmm5, %xmm1 -; SSSE3-NEXT: 
movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: movdqa %xmm10, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11 +; SSSE3-NEXT: movdqa %xmm9, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 ; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm12, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm9 ; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm10 +; SSSE3-NEXT: por %xmm9, %xmm10 ; SSSE3-NEXT: pxor %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: pand %xmm11, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm9 +; SSSE3-NEXT: pxor %xmm10, %xmm9 +; SSSE3-NEXT: movdqa %xmm9, %xmm5 ; SSSE3-NEXT: pandn %xmm1, %xmm5 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm9, %xmm1 ; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm10 -; SSSE3-NEXT: psubq %xmm6, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 ; SSSE3-NEXT: pxor %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm10, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: psubq %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm9 +; SSSE3-NEXT: pxor %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] ; SSSE3-NEXT: pand %xmm11, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm9 ; SSSE3-NEXT: pxor %xmm8, %xmm6 ; SSSE3-NEXT: movdqa %xmm6, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 @@ -1802,44 +1802,44 @@ ; SSSE3-NEXT: pand %xmm10, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pandn %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm9, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pandn %xmm2, %xmm6 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 ; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: por %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: psubq %xmm7, %xmm3 +; SSSE3-NEXT: por %xmm6, %xmm2 ; SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSSE3-NEXT: pxor %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} 
xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm4 +; SSSE3-NEXT: psubq %xmm7, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 ; SSSE3-NEXT: pxor %xmm8, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm7, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm3, %xmm5 +; SSSE3-NEXT: pand %xmm9, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm5 +; SSSE3-NEXT: pxor %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pandn %xmm3, %xmm6 ; SSSE3-NEXT: psrad $31, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm3 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: por %xmm5, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: por %xmm6, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i64: @@ -1900,12 +1900,12 @@ ; SSE41-NEXT: psubq %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm5 -; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm9 +; SSE41-NEXT: por %xmm0, %xmm9 ; SSE41-NEXT: pxor %xmm10, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 @@ -1913,7 +1913,7 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm0, %xmm4 ; SSE41-NEXT: por %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 ; SSE41-NEXT: movapd %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 @@ -2023,66 +2023,62 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE-LABEL: v2i128: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbx ; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 -; SSE-NEXT: seto %r10b -; SSE-NEXT: movq %r8, %rbx -; SSE-NEXT: sarq $63, %rbx -; SSE-NEXT: testb %r10b, %r10b -; SSE-NEXT: cmovneq %rbx, %rcx +; SSE-NEXT: seto %dil +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: sarq $63, %r10 +; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: cmovneq %r10, %rcx ; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 -; SSE-NEXT: xorq %r11, %rbx -; SSE-NEXT: testb %r10b, %r10b -; SSE-NEXT: cmoveq %r8, %rbx +; SSE-NEXT: xorq %r11, %r10 +; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: cmoveq %r8, %r10 ; SSE-NEXT: subq %r9, %rsi ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx -; SSE-NEXT: seto %r8b -; SSE-NEXT: movq %rdx, %rdi -; SSE-NEXT: sarq $63, %rdi -; SSE-NEXT: 
testb %r8b, %r8b -; SSE-NEXT: cmovneq %rdi, %rsi -; SSE-NEXT: xorq %r11, %rdi -; SSE-NEXT: testb %r8b, %r8b -; SSE-NEXT: cmoveq %rdx, %rdi +; SSE-NEXT: seto %dil +; SSE-NEXT: movq %rdx, %r8 +; SSE-NEXT: sarq $63, %r8 +; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: cmovneq %r8, %rsi +; SSE-NEXT: xorq %r11, %r8 +; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: cmoveq %rdx, %r8 ; SSE-NEXT: movq %rcx, 16(%rax) ; SSE-NEXT: movq %rsi, (%rax) -; SSE-NEXT: movq %rbx, 24(%rax) -; SSE-NEXT: movq %rdi, 8(%rax) -; SSE-NEXT: popq %rbx +; SSE-NEXT: movq %r10, 24(%rax) +; SSE-NEXT: movq %r8, 8(%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: v2i128: ; AVX: # %bb.0: -; AVX-NEXT: pushq %rbx ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: seto %r10b -; AVX-NEXT: movq %r8, %rbx -; AVX-NEXT: sarq $63, %rbx -; AVX-NEXT: testb %r10b, %r10b -; AVX-NEXT: cmovneq %rbx, %rcx +; AVX-NEXT: seto %dil +; AVX-NEXT: movq %r8, %r10 +; AVX-NEXT: sarq $63, %r10 +; AVX-NEXT: testb %dil, %dil +; AVX-NEXT: cmovneq %r10, %rcx ; AVX-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 -; AVX-NEXT: xorq %r11, %rbx -; AVX-NEXT: testb %r10b, %r10b -; AVX-NEXT: cmoveq %r8, %rbx +; AVX-NEXT: xorq %r11, %r10 +; AVX-NEXT: testb %dil, %dil +; AVX-NEXT: cmoveq %r8, %r10 ; AVX-NEXT: subq %r9, %rsi ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx -; AVX-NEXT: seto %r8b -; AVX-NEXT: movq %rdx, %rdi -; AVX-NEXT: sarq $63, %rdi -; AVX-NEXT: testb %r8b, %r8b -; AVX-NEXT: cmovneq %rdi, %rsi -; AVX-NEXT: xorq %r11, %rdi -; AVX-NEXT: testb %r8b, %r8b -; AVX-NEXT: cmoveq %rdx, %rdi +; AVX-NEXT: seto %dil +; AVX-NEXT: movq %rdx, %r8 +; AVX-NEXT: sarq $63, %r8 +; AVX-NEXT: testb %dil, %dil +; AVX-NEXT: cmovneq %r8, %rsi +; AVX-NEXT: xorq %r11, %r8 +; AVX-NEXT: testb %dil, %dil +; AVX-NEXT: cmoveq %rdx, %r8 ; AVX-NEXT: movq %rcx, 16(%rax) ; AVX-NEXT: movq %rsi, (%rax) -; AVX-NEXT: movq %rbx, 24(%rax) -; AVX-NEXT: movq %rdi, 8(%rax) -; AVX-NEXT: popq %rbx +; AVX-NEXT: movq %r10, 24(%rax) +; AVX-NEXT: movq %r8, 8(%rax) ; AVX-NEXT: retq %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y) ret <2 x i128> %z diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir --- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir +++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir @@ -328,7 +328,6 @@ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: TEST64rr [[COPY2]], [[COPY2]], implicit-def $eflags ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[COPY2]] - ; CHECK-NEXT: [[MOV32ri1:%[0-9]+]]:gr32 = MOV32ri -1 ; CHECK-NEXT: JCC_1 %bb.9, 4, implicit killed $eflags ; CHECK-NEXT: JMP_1 %bb.6 ; CHECK-NEXT: {{ $}} @@ -342,8 +341,8 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY [[NOT64r1]] ; CHECK-NEXT: [[OR32ri:%[0-9]+]]:gr32 = OR32ri [[OR32ri]], 268435456, implicit-def dead $eflags ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr32 = COPY [[OR32ri]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY [[COPY3]] ; CHECK-NEXT: [[DEF:%[0-9]+]]:gr64 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY [[COPY3]] ; CHECK-NEXT: undef %81.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7.bb33: @@ -353,9 +352,9 @@ ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm undef %59:gr64, 1, $noreg, 0, $noreg :: (load unordered (s64) from `i8 addrspace(1)* addrspace(1)* 
undef`, addrspace 1) ; CHECK-NEXT: [[NOT64r2:%[0-9]+]]:gr64 = NOT64r [[NOT64r2]] ; CHECK-NEXT: CMP64rr [[NOT64r2]], [[COPY6]], implicit-def $eflags - ; CHECK-NEXT: undef %102.sub_32bit:gr64_with_sub_8bit = MOV32ri 0 - ; CHECK-NEXT: [[CMOV64rr:%[0-9]+]]:gr64 = CMOV64rr [[CMOV64rr]], %102, 4, implicit killed $eflags - ; CHECK-NEXT: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4456457 /* reguse:GR64 */, %102, 4456457 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags + ; CHECK-NEXT: undef %100.sub_32bit:gr64_with_sub_8bit = MOV32ri 0 + ; CHECK-NEXT: [[CMOV64rr:%[0-9]+]]:gr64 = CMOV64rr [[CMOV64rr]], %100, 4, implicit killed $eflags + ; CHECK-NEXT: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4456457 /* reguse:GR64 */, %100, 4456457 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags ; CHECK-NEXT: LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, [[COPY5]], implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: $rdi = COPY [[COPY4]] @@ -383,7 +382,7 @@ ; CHECK-NEXT: bb.9.bb64: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: LCMPXCHG32 undef %76:gr64, 1, $noreg, 0, $noreg, [[MOV32ri1]], implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: LCMPXCHG32 undef %76:gr64, 1, $noreg, 0, $noreg, [[MOV32ri]], implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: STATEPOINT 2, 5, 1, undef %79:gr64, undef $rdi, 2, 0, 2, 0, 2, 27, 2, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 133, 2, 0, 2, 5, 2, 1, 2, 7, 2, 0, 2, 8, 2, 2, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 8, 2, 2, 2, 0, 2, 0, 2, 0, csr_64, implicit-def $rsp, implicit-def $ssp ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir --- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir +++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir @@ -212,8 +212,7 @@ ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-NEXT: undef %75.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags - ; CHECK-NEXT: MOV64mr %stack.2, 1, $noreg, 0, $noreg, %75 :: (store (s64) into %stack.2) + ; CHECK-NEXT: undef %39.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def 
$rdi ; CHECK-NEXT: STATEPOINT 2, 5, 2, undef %24:gr64, $rdi, undef $rsi, 2, 0, 2, 0, 2, 37, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 6, 2, 0, 2, 4, 2, 1, 2, 0, 2, 0, 2, 7, 2, 0, 2, 0, 2, 0, 2, 7, 2, 0, 2, 0, 2, 0, 2, 2, 2, 4, 2, 5, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 1, 2, 0, 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp @@ -238,9 +237,7 @@ ; CHECK-NEXT: $esi = COPY %66.sub_32bit ; CHECK-NEXT: $edx = COPY [[LEA64_32r]] ; CHECK-NEXT: $r8d = COPY [[MOV32rm]] - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %stack.2) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY [[MOV64rm]] - ; CHECK-NEXT: [[STATEPOINT:%[0-9]+]]:gr64, [[STATEPOINT1:%[0-9]+]]:gr64, [[STATEPOINT2:%[0-9]+]]:gr64, [[STATEPOINT3:%[0-9]+]]:gr64 = STATEPOINT 2, 5, 5, undef %35:gr64, $rdi, $esi, $edx, undef $rcx, $r8d, 2, 0, 2, 0, 2, 85, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 10, 2, 5, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT3]], 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 2, 2, 11, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 15, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 21, 2, 63, 2, 0, 2, 9, 2, 1, 2, 0, [[STATEPOINT2]], 2, 0, [[STATEPOINT1]], 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 3, 1, 4, %stack.0, 0, 2, 3, 2, 4278124286, 2, 3, 1, 4, %stack.1, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 4, [[STATEPOINT]](tied-def 0), [[STATEPOINT1]](tied-def 1), [[STATEPOINT2]](tied-def 2), [[STATEPOINT3]](tied-def 3), 2, 0, 2, 4, 0, 0, 1, 1, 2, 2, 3, 3, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s32) on %stack.0), (volatile load store (s32) on %stack.1) + ; CHECK-NEXT: [[STATEPOINT:%[0-9]+]]:gr64, [[STATEPOINT1:%[0-9]+]]:gr64, [[STATEPOINT2:%[0-9]+]]:gr64, [[STATEPOINT3:%[0-9]+]]:gr64_with_sub_8bit = STATEPOINT 2, 5, 5, undef %35:gr64, $rdi, $esi, $edx, undef $rcx, $r8d, 2, 0, 2, 0, 2, 85, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 10, 2, 5, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT3]], 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 2, 2, 11, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 15, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 21, 2, 63, 2, 0, 2, 9, 2, 1, 2, 0, [[STATEPOINT2]], 2, 0, [[STATEPOINT1]], 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 3, 1, 4, %stack.0, 0, 2, 3, 2, 4278124286, 2, 3, 1, 4, %stack.1, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 4, [[STATEPOINT]](tied-def 0), [[STATEPOINT1]](tied-def 1), [[STATEPOINT2]](tied-def 2), [[STATEPOINT3]](tied-def 3), 2, 0, 2, 4, 0, 0, 1, 1, 2, 2, 3, 3, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s32) on %stack.0), (volatile load store (s32) on %stack.1) ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: CMP32rr %65.sub_32bit, undef %37:gr32, implicit-def $eflags ; CHECK-NEXT: JCC_1 %bb.4, 13, implicit killed $eflags @@ -248,10 +245,8 @@ ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ 
$}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY [[STATEPOINT3]] - ; CHECK-NEXT: MOV64mr %stack.2, 1, $noreg, 0, $noreg, [[COPY]] :: (store (s64) into %stack.2) ; CHECK-NEXT: [[DEF1:%[0-9]+]]:gr64 = IMPLICIT_DEF - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY [[LEA64_32r]] + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY [[LEA64_32r]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.bb21: ; CHECK-NEXT: successors: %bb.1(0x80000000) @@ -265,8 +260,8 @@ ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: $ecx = MOV32r0 implicit-def dead $eflags - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY [[LEA64_32r]] - ; CHECK-NEXT: [[STATEPOINT2]]:gr64, [[STATEPOINT3]]:gr64, [[STATEPOINT]]:gr64, dead [[STATEPOINT1]]:gr64 = STATEPOINT 1, 16, 5, undef %47:gr64, undef $edi, undef $rsi, undef $rdx, $ecx, undef $r8d, 2, 0, 2, 0, 2, 99, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 10, 2, 5, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT3]], 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 2, 2, 11, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 15, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 21, 2, 96, 2, 0, 2, 9, 2, 1, 2, 0, [[STATEPOINT2]], 2, 0, [[STATEPOINT1]], 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 3, 1, 4, %stack.0, 0, 2, 3, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 8, 2, 12, 2, 34, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT2]], 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 5, [[STATEPOINT2]](tied-def 0), [[STATEPOINT3]](tied-def 1), [[STATEPOINT]](tied-def 2), [[STATEPOINT1]](tied-def 3), 2, 4278124286, 2, 0, 2, 5, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s32) on %stack.0) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY [[LEA64_32r]] + ; CHECK-NEXT: [[STATEPOINT2]]:gr64, [[STATEPOINT3]]:gr64_with_sub_8bit, [[STATEPOINT]]:gr64, dead [[STATEPOINT1]]:gr64 = STATEPOINT 1, 16, 5, undef %47:gr64, undef $edi, undef $rsi, undef $rdx, $ecx, undef $r8d, 2, 0, 2, 0, 2, 99, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 10, 2, 5, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT3]], 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 2, 2, 11, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 15, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 21, 2, 96, 2, 0, 2, 9, 2, 1, 2, 0, [[STATEPOINT2]], 2, 0, [[STATEPOINT1]], 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 3, 1, 4, %stack.0, 0, 2, 3, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 8, 2, 12, 2, 34, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT2]], 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 5, [[STATEPOINT2]](tied-def 0), [[STATEPOINT3]](tied-def 1), [[STATEPOINT]](tied-def 2), [[STATEPOINT1]](tied-def 3), 2, 4278124286, 2, 0, 2, 5, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s32) on %stack.0) ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: JMP_1 %bb.5 @@ -274,9 +269,7 @@ ; CHECK-NEXT: bb.5.bb30: ; CHECK-NEXT: successors: 
%bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[STATEPOINT3]] - ; CHECK-NEXT: MOV64mr %stack.2, 1, $noreg, 0, $noreg, [[COPY3]] :: (store (s64) into %stack.2) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[STATEPOINT2]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY [[STATEPOINT2]] ; CHECK-NEXT: [[ADD64ri8_:%[0-9]+]]:gr64 = nuw ADD64ri8 [[ADD64ri8_]], 28, implicit-def dead $eflags ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @barney, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp @@ -292,7 +285,7 @@ ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: $edi = MOV32ri 3 - ; CHECK-NEXT: dead [[STATEPOINT3]]:gr64, dead [[DEF]]:gr64 = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @wombat, $edi, 2, 0, 2, 2, 2, 97, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 10, 2, 5, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT3]], 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 2, 2, 11, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 15, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 21, 2, 96, 2, 0, 2, 9, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 3, [[COPY2]], 2, 3, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 0, 2, 12, 2, 51, 2, 0, 2, 3, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 3, [[STATEPOINT3]](tied-def 0), 2, 4278124286, [[DEF]](tied-def 1), 2, 0, 2, 3, 0, 0, 1, 1, 2, 2, csr_64, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: dead [[STATEPOINT3]]:gr64_with_sub_8bit, dead [[DEF]]:gr64 = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @wombat, $edi, 2, 0, 2, 2, 2, 97, 2, 0, 2, 2, 2, 0, 2, 43, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 1, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 10, 2, 5, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT3]], 2, 0, [[STATEPOINT3]], 2, 7, 2, 0, 2, 0, [[STATEPOINT3]], 2, 2, 2, 11, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 15, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 21, 2, 96, 2, 0, 2, 9, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 3, [[COPY1]], 2, 3, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 0, 2, 12, 2, 51, 2, 0, 2, 3, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 3, [[STATEPOINT3]](tied-def 0), 2, 4278124286, [[DEF]](tied-def 1), 2, 0, 2, 3, 0, 0, 1, 1, 2, 2, csr_64, implicit-def $rsp, implicit-def $ssp ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp bb.0.bb: successors: %bb.1(0x80000000) diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir --- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir +++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir @@ -273,7 +273,7 @@ ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm undef %17:gr64, 1, $noreg, 0, $noreg :: (load 
unordered (s64) from `i8 addrspace(1)* addrspace(1)* undef`, addrspace 1) - ; CHECK-NEXT: [[NOT64r:%[0-9]+]]:gr64 = NOT64r [[NOT64r]] + ; CHECK-NEXT: [[NOT64r:%[0-9]+]]:gr64 = NOT64r [[MOV64rm]] ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, [[NOT64r]] :: (store (s64) into %stack.1) ; CHECK-NEXT: undef %48.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags ; CHECK-NEXT: [[DEF:%[0-9]+]]:gr64 = IMPLICIT_DEF @@ -332,8 +332,7 @@ ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %stack.1) ; CHECK-NEXT: dead $edx = MOV32r0 implicit-def dead $eflags, implicit-def $rdx ; CHECK-NEXT: $ecx = MOV32r0 implicit-def dead $eflags - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[DEF2]] - ; CHECK-NEXT: dead [[MOV64rm]]:gr64, dead [[COPY1]]:gr64, dead [[DEF1]]:gr64, dead [[DEF]]:gr64 = STATEPOINT 1, 16, 5, undef %41:gr64, undef $edi, undef $rsi, $rdx, $ecx, undef $r8d, 2, 0, 2, 0, 2, 89, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[COPY1]], 2, 7, 2, 0, 2, 0, [[COPY1]], 2, 10, 2, 2, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[COPY1]], 2, 0, [[COPY1]], 2, 7, 2, 0, 2, 0, [[COPY1]], 2, 2, 2, 8, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 12, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 18, 2, 96, 2, 0, 2, 9, 2, 1, 2, 0, [[DEF1]], 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 3, 1, 4, %stack.0, 0, 2, 3, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 8, 2, 9, 2, 34, 2, 0, 2, 3, 2, 1, 2, 0, [[DEF1]], 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 5, [[MOV64rm]](tied-def 0), [[COPY1]](tied-def 1), [[DEF1]](tied-def 2), 2, 4278124286, [[DEF]](tied-def 3), 2, 0, 2, 5, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s32) on %stack.0) + ; CHECK-NEXT: dead [[MOV64rm]]:gr64, dead [[DEF2]]:gr64_with_sub_8bit, dead [[DEF1]]:gr64, dead [[DEF]]:gr64 = STATEPOINT 1, 16, 5, undef %41:gr64, undef $edi, undef $rsi, $rdx, $ecx, undef $r8d, 2, 0, 2, 0, 2, 89, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF2]], 2, 7, 2, 0, 2, 0, [[DEF2]], 2, 10, 2, 2, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, [[DEF2]], 2, 0, [[DEF2]], 2, 7, 2, 0, 2, 0, [[DEF2]], 2, 2, 2, 8, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 12, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 18, 2, 96, 2, 0, 2, 9, 2, 1, 2, 0, [[DEF1]], 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 3, 1, 4, %stack.0, 0, 2, 3, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[DEF]], 2, 8, 2, 9, 2, 34, 2, 0, 2, 3, 2, 1, 2, 0, [[DEF1]], 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 5, [[MOV64rm]](tied-def 0), [[DEF2]](tied-def 1), [[DEF1]](tied-def 2), 2, 4278124286, [[DEF]](tied-def 3), 2, 0, 2, 5, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s32) on %stack.0) ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: JMP_1 %bb.10 @@ -342,14 +341,14 @@ ; CHECK-NEXT: successors: %bb.11(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF3:%[0-9]+]]:gr64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:gr64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:gr64 = IMPLICIT_DEF ; CHECK-NEXT: undef [[DEF2]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.11.bb27: ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.17(0x00000000) ; CHECK-NEXT: {{ $}} 
; CHECK-NEXT: TEST32rr [[ADD32rr]], [[ADD32rr]], implicit-def $eflags - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY [[ADD32rr]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY [[ADD32rr]] ; CHECK-NEXT: JCC_1 %bb.2, 8, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.17 ; CHECK-NEXT: {{ $}} @@ -387,13 +386,12 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: $edi = MOV32ri -39 - ; CHECK-NEXT: STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 103, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 10, 2, 2, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 2, 2, 8, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 12, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 18, 2, 63, 2, 0, 2, 9, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 3, [[COPY2]], 2, 3, 2, 0, 2, 3, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 2, 2, 33, 2, 6, 2, 0, 2, 5, 2, 0, 2, 0, 2, 4278124286, 2, 3, [[COPY2]], 2, 3, 2, 4278124286, 2, 0, 2, 4278124286, 2, 3, 2, 4278124286, 2, 1, 2, 34, 2, 14, 2, 0, 2, 3, 2, 0, 2, 3, [[COPY2]], 2, 3, 2, 4278124286, 2, 3, 2, 0, 2, 1, 2, 4278124286, 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 103, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 10, 2, 2, 2, 12, 2, 0, 2, 3, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 2, 2, 8, 2, 4, 2, 0, 2, 1, 2, 0, 2, 7, 2, 0, 2, 2, 2, 12, 2, 7, 2, 0, 2, 2, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 10, 2, 18, 2, 63, 2, 0, 2, 9, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 3, [[COPY1]], 2, 3, 2, 0, 2, 3, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 2, 2, 33, 2, 6, 2, 0, 2, 5, 2, 0, 2, 0, 2, 4278124286, 2, 3, [[COPY1]], 2, 3, 2, 4278124286, 2, 0, 2, 4278124286, 2, 3, 2, 4278124286, 2, 1, 2, 34, 2, 14, 2, 0, 2, 3, 2, 0, 2, 3, [[COPY1]], 2, 3, 2, 4278124286, 2, 3, 2, 0, 2, 1, 2, 4278124286, 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.17.bb44: ; CHECK-NEXT: successors: %bb.22(0x40000000), %bb.18(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[DEF2]] ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @hoge.1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp @@ -420,7 +418,7 @@ ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: $ecx = MOV32r0 implicit-def dead $eflags - ; CHECK-NEXT: [[STATEPOINT:%[0-9]+]]:gr64 = STATEPOINT 1, 16, 5, undef %60:gr64, undef $edi, undef $rsi, undef $rdx, $ecx, undef 
$r8d, 2, 0, 2, 0, 2, 45, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 10, 2, 2, 2, 19, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 8, 2, 9, 2, 34, 2, 0, 2, 3, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 2, [[STATEPOINT]](tied-def 0), 2, 4278124286, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: [[STATEPOINT:%[0-9]+]]:gr64_with_sub_8bit = STATEPOINT 1, 16, 5, undef %60:gr64, undef $edi, undef $rsi, undef $rdx, $ecx, undef $r8d, 2, 0, 2, 0, 2, 45, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 10, 2, 2, 2, 19, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 8, 2, 9, 2, 34, 2, 0, 2, 3, 2, 1, 2, 0, 2, 4278124286, 2, 0, 2, 4278124286, 2, 7, 2, 0, 2, 0, 2, 4278124286, 2, 2, [[STATEPOINT]](tied-def 0), 2, 4278124286, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: JMP_1 %bb.21 @@ -435,7 +433,7 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: $edi = MOV32ri 10 - ; CHECK-NEXT: dead [[STATEPOINT]]:gr64 = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 45, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 10, 2, 2, 2, 19, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 1, 2, 9, 2, 6, 2, 1, 2, 3, 2, 0, 2, 0, 2, 0, 2, 0, 2, 4278124286, 2, 0, 2, 0, 2, 7, 2, 0, 2, 3, [[STATEPOINT]](tied-def 0), 2, 0, 2, 4278124286, 2, 0, 2, 3, 0, 0, 1, 1, 2, 2, csr_64, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: dead [[STATEPOINT]]:gr64_with_sub_8bit = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 45, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 10, 2, 2, 2, 19, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 1, 2, 9, 2, 6, 2, 1, 2, 3, 2, 0, 2, 0, 2, 0, 2, 0, 2, 4278124286, 2, 0, 2, 0, 2, 7, 2, 0, 2, 3, [[STATEPOINT]](tied-def 0), 2, 0, 2, 4278124286, 2, 0, 2, 3, 0, 0, 1, 1, 2, 2, csr_64, implicit-def $rsp, implicit-def $ssp ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.23.bb52 (landing-pad): @@ -445,13 +443,13 @@ ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: $edi = MOV32ri 3 - ; CHECK-NEXT: dead [[STATEPOINT]]:gr64 = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 43, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 10, 2, 2, 2, 19, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 0, 2, 9, 2, 51, 2, 0, 2, 3, 
2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 1, [[STATEPOINT]](tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: dead [[STATEPOINT]]:gr64_with_sub_8bit = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 43, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 10, 2, 2, 2, 19, 2, 0, 2, 3, 2, 1, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 0, 2, 9, 2, 51, 2, 0, 2, 3, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 7, 2, 0, 2, 1, [[STATEPOINT]](tied-def 0), 2, 0, 2, 1, 0, 0, csr_64, implicit-def $rsp, implicit-def $ssp ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.24.bb56: ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: $edi = MOV32ri 10 - ; CHECK-NEXT: dead [[STATEPOINT]]:gr64 = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 33, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 9, 2, 2, 2, 26, 2, 1, 2, 3, 2, 1, 2, 0, 2, 0, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 2, [[STATEPOINT]](tied-def 0), 2, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: dead [[STATEPOINT]]:gr64_with_sub_8bit = STATEPOINT 2882400000, 0, 1, target-flags(x86-plt) @ham, $edi, 2, 0, 2, 2, 2, 33, 2, 0, 2, 10, 2, 0, 2, 10, 2, 0, 2, 4, 2, 1, 2, 7, 2, 0, 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 9, 2, 2, 2, 26, 2, 1, 2, 3, 2, 1, 2, 0, 2, 0, 2, 0, [[STATEPOINT]], 2, 0, [[STATEPOINT]], 2, 7, 2, 0, 2, 0, [[STATEPOINT]], 2, 2, [[STATEPOINT]](tied-def 0), 2, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp bb.0.bb: successors: %bb.1(0x80000000), %bb.12(0x00000000) diff --git a/llvm/test/CodeGen/X86/statepoint-live-in-remat.ll b/llvm/test/CodeGen/X86/statepoint-live-in-remat.ll --- a/llvm/test/CodeGen/X86/statepoint-live-in-remat.ll +++ b/llvm/test/CodeGen/X86/statepoint-live-in-remat.ll @@ -31,18 +31,14 @@ ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %r14d -; CHECK-NEXT: movl %r8d, %r15d -; CHECK-NEXT: movl %ecx, %r12d -; CHECK-NEXT: movl %edx, %r13d -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movl %r9d, %ebx +; CHECK-NEXT: movl %r8d, %ebp +; CHECK-NEXT: movl %ecx, %r14d +; CHECK-NEXT: movl %edx, %r15d +; CHECK-NEXT: movl %esi, %r12d +; CHECK-NEXT: movl %edi, %r13d ; CHECK-NEXT: movabsq $_bar, %rax ; CHECK-NEXT: callq *%rax -; CHECK-NEXT: movl %ebp, %eax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl %ebx, %eax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl %r13d, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl %r12d, %eax @@ -51,6 +47,10 @@ ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl %r14d, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: 
movl %ebp, %eax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax @@ -63,9 +63,9 @@ ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r12d -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r14d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r11d @@ -77,8 +77,8 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movabsq $_bar, %r13 -; CHECK-NEXT: callq *%r13 ## 96-byte Folded Reload +; CHECK-NEXT: movabsq $_bar, %r15 +; CHECK-NEXT: callq *%r15 ## 96-byte Folded Reload ; CHECK-NEXT: Ltmp0: ; CHECK-NEXT: addq $104, %rsp ; CHECK-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll --- a/llvm/test/CodeGen/X86/statepoint-live-in.ll +++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll @@ -372,12 +372,12 @@ ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %r15d -; CHECK-NEXT: movl %r8d, %r14d -; CHECK-NEXT: movl %ecx, %r12d -; CHECK-NEXT: movl %edx, %r13d -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movl %r9d, %ebp +; CHECK-NEXT: movl %r8d, %ebx +; CHECK-NEXT: movl %ecx, %r14d +; CHECK-NEXT: movl %edx, %r15d +; CHECK-NEXT: movl %esi, %r12d +; CHECK-NEXT: movl %edi, %r13d ; CHECK-NEXT: callq _bar ; CHECK-NEXT: Ltmp11: ; CHECK-NEXT: callq _bar diff --git a/llvm/test/CodeGen/X86/statepoint-ra-no-ls.ll b/llvm/test/CodeGen/X86/statepoint-ra-no-ls.ll --- a/llvm/test/CodeGen/X86/statepoint-ra-no-ls.ll +++ b/llvm/test/CodeGen/X86/statepoint-ra-no-ls.ll @@ -30,28 +30,28 @@ ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq (%rdi), %r14 +; CHECK-NEXT: movq (%rdi), %rbx ; CHECK-NEXT: movq 8(%rdi), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq 16(%rdi), %r15 -; CHECK-NEXT: movq 24(%rdi), %r12 -; CHECK-NEXT: movq 32(%rdi), %r13 -; CHECK-NEXT: movq 40(%rdi), %rbx +; CHECK-NEXT: movq 16(%rdi), %r14 +; CHECK-NEXT: movq 24(%rdi), %r15 +; CHECK-NEXT: movq 32(%rdi), %r12 +; CHECK-NEXT: movq 40(%rdi), %r13 ; CHECK-NEXT: movq 48(%rdi), %rbp -; CHECK-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: callq foo@PLT # 8-byte Folded Reload ; CHECK-NEXT: .Ltmp0: -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; CHECK-NEXT: movq %rbp, %rdi ; CHECK-NEXT: callq bar@PLT -; CHECK-NEXT: movq %rbx, %rdi -; CHECK-NEXT: callq bar@PLT ; CHECK-NEXT: movq %r13, %rdi ; CHECK-NEXT: callq bar@PLT ; CHECK-NEXT: movq %r12, %rdi ; CHECK-NEXT: callq bar@PLT ; CHECK-NEXT: movq %r15, %rdi ; CHECK-NEXT: callq bar@PLT +; CHECK-NEXT: movq %r14, %rdi +; CHECK-NEXT: callq bar@PLT ; 
CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; CHECK-NEXT: callq bar@PLT ; CHECK-NEXT: addq $24, %rsp diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll --- a/llvm/test/CodeGen/X86/statepoint-regs.ll +++ b/llvm/test/CodeGen/X86/statepoint-regs.ll @@ -75,12 +75,12 @@ ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %r14d -; CHECK-NEXT: movl %r8d, %r15d -; CHECK-NEXT: movl %ecx, %r12d -; CHECK-NEXT: movl %edx, %r13d -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movl %r9d, %ebx +; CHECK-NEXT: movl %r8d, %ebp +; CHECK-NEXT: movl %ecx, %r14d +; CHECK-NEXT: movl %edx, %r15d +; CHECK-NEXT: movl %esi, %r12d +; CHECK-NEXT: movl %edi, %r13d ; CHECK-NEXT: callq _bar ; CHECK-NEXT: Ltmp3: ; CHECK-NEXT: addq $8, %rsp @@ -123,12 +123,12 @@ ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %r14d -; CHECK-NEXT: movl %r8d, %r15d -; CHECK-NEXT: movl %ecx, %r12d -; CHECK-NEXT: movl %edx, %r13d -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movl %r9d, %ebx +; CHECK-NEXT: movl %r8d, %ebp +; CHECK-NEXT: movl %ecx, %r14d +; CHECK-NEXT: movl %edx, %r15d +; CHECK-NEXT: movl %esi, %r12d +; CHECK-NEXT: movl %edi, %r13d ; CHECK-NEXT: callq _bar ; CHECK-NEXT: Ltmp4: ; CHECK-NEXT: addq $8, %rsp @@ -234,12 +234,12 @@ ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl %edi, %r13d -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %edi, %r12d +; CHECK-NEXT: movl %esi, %r13d ; CHECK-NEXT: movl %edx, %ebp -; CHECK-NEXT: movl %ecx, %r14d -; CHECK-NEXT: movl %r8d, %r15d -; CHECK-NEXT: movl %r9d, %r12d +; CHECK-NEXT: movl %ecx, %ebx +; CHECK-NEXT: movl %r8d, %r14d +; CHECK-NEXT: movl %r9d, %r15d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax @@ -330,10 +330,10 @@ ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl %r8d, (%rsp) ## 4-byte Spill -; CHECK-NEXT: movl %ecx, %r12d -; CHECK-NEXT: movl %edx, %r13d -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movl %ecx, %r14d +; CHECK-NEXT: movl %edx, %r15d +; CHECK-NEXT: movl %esi, %r12d +; CHECK-NEXT: movl %edi, %r13d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax @@ -366,8 +366,8 @@ ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r14d -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp ; CHECK-NEXT: callq _bar ## 132-byte Folded Reload ; CHECK-NEXT: Ltmp10: ; CHECK-NEXT: addq $136, %rsp @@ -434,12 +434,12 @@ ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %r14d -; CHECK-NEXT: movl %r8d, %r15d -; CHECK-NEXT: movl %ecx, %r12d -; CHECK-NEXT: movl %edx, %r13d -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: 
movl %r9d, %ebx +; CHECK-NEXT: movl %r8d, %ebp +; CHECK-NEXT: movl %ecx, %r14d +; CHECK-NEXT: movl %edx, %r15d +; CHECK-NEXT: movl %esi, %r12d +; CHECK-NEXT: movl %edi, %r13d ; CHECK-NEXT: callq _bar ; CHECK-NEXT: Ltmp11: ; CHECK-NEXT: addq $8, %rsp @@ -484,12 +484,12 @@ ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %r9d, %r15d -; CHECK-NEXT: movl %r8d, %r14d -; CHECK-NEXT: movl %ecx, %r12d -; CHECK-NEXT: movl %edx, %r13d -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movl %r9d, %ebp +; CHECK-NEXT: movl %r8d, %ebx +; CHECK-NEXT: movl %ecx, %r14d +; CHECK-NEXT: movl %edx, %r15d +; CHECK-NEXT: movl %esi, %r12d +; CHECK-NEXT: movl %edi, %r13d ; CHECK-NEXT: callq _bar ; CHECK-NEXT: Ltmp12: ; CHECK-NEXT: callq _bar diff --git a/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll b/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll --- a/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll +++ b/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll @@ -15,14 +15,14 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: .cfi_offset %rbx, -16 ; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-NEXT: movw %di, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %r11, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %r10, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movw %r11w, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: andb $3, %sil ; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/statepoint-stack-usage.ll b/llvm/test/CodeGen/X86/statepoint-stack-usage.ll --- a/llvm/test/CodeGen/X86/statepoint-stack-usage.ll +++ b/llvm/test/CodeGen/X86/statepoint-stack-usage.ll @@ -65,17 +65,17 @@ ; CHECK-DAG: movl %esi, 8(%rsp) ; CHECK-DAG: movl %edx, 4(%rsp) ; CHECK: callq -; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %r14d, 12(%rsp) ; CHECK-DAG: movl %ebp, 8(%rsp) -; CHECK-DAG: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 4(%rsp) ; CHECK: callq -; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %r14d, 12(%rsp) ; CHECK-DAG: movl %ebp, 8(%rsp) -; CHECK-DAG: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 4(%rsp) ; CHECK: callq -; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %r14d, 12(%rsp) ; CHECK-DAG: movl %ebp, 8(%rsp) -; CHECK-DAG: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 4(%rsp) ; CHECK: callq call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c)] call token (i64, i32, ptr, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c)] diff --git a/llvm/test/CodeGen/X86/statepoint-vreg-details.ll b/llvm/test/CodeGen/X86/statepoint-vreg-details.ll --- a/llvm/test/CodeGen/X86/statepoint-vreg-details.ll +++ b/llvm/test/CodeGen/X86/statepoint-vreg-details.ll @@ -59,15 +59,15 @@ ; CHECK-VREG: CALL64pcrel32 @consume5, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit $rdx, implicit $rcx, implicit $r8, implicit-def $rsp, implicit-def $ssp ; CHECK-PREG-LABEL: name: test_mixed -; CHECK-PREG: renamable $r14 = COPY $rdx -; CHECK-PREG: renamable $r15 = COPY $rsi -; CHECK-PREG: renamable $rbx = COPY $rdi -; CHECK-PREG: renamable $r14, renamable $r15, renamable $rbx = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 4, killed renamable $r14(tied-def 0), 2, 0, killed renamable $r15(tied-def 1), killed renamable $rbx(tied-def 2), 2, 0, 2, 4, 0, 0, 1, 1, 2, 2, 3, 3, csr_64, implicit-def $rsp, implicit-def $ssp -; CHECK-PREG: $rdi = COPY killed renamable $rbx +; CHECK-PREG: renamable $rbx = COPY $rdx +; CHECK-PREG: renamable $r14 = COPY $rsi +; CHECK-PREG: renamable $r15 = COPY $rdi +; CHECK-PREG: renamable $rbx, renamable $r14, renamable $r15 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 4, killed renamable $rbx(tied-def 0), 2, 0, killed renamable $r14(tied-def 1), killed renamable $r15(tied-def 2), 2, 0, 2, 4, 0, 0, 1, 1, 2, 2, 3, 3, csr_64, implicit-def $rsp, implicit-def $ssp +; CHECK-PREG: $rdi = COPY killed renamable $r15 ; CHECK-PREG: dead $esi = MOV32r0 implicit-def dead $eflags, implicit-def $rsi -; CHECK-PREG: $rdx = COPY killed renamable $r15 +; CHECK-PREG: $rdx = COPY killed renamable $r14 ; CHECK-PREG: dead $ecx = MOV32r0 implicit-def dead $eflags, implicit-def $rcx -; CHECK-PREG: $r8 = COPY killed renamable $r14 +; CHECK-PREG: $r8 = COPY killed renamable $rbx ; CHECK-PREG: CALL64pcrel32 @consume5, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit $rdx, implicit killed $rcx, implicit killed $r8, implicit-def $rsp, implicit-def $ssp entry: diff --git a/llvm/test/CodeGen/X86/statepoint-vreg-invoke.ll b/llvm/test/CodeGen/X86/statepoint-vreg-invoke.ll --- a/llvm/test/CodeGen/X86/statepoint-vreg-invoke.ll +++ b/llvm/test/CodeGen/X86/statepoint-vreg-invoke.ll @@ -66,58 +66,58 @@ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; CHECK-NEXT: liveins: $edi, $rcx, $rdx, $rsi ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $rbx = COPY $rcx - ; CHECK-NEXT: renamable $rbp = COPY $rdx - ; CHECK-NEXT: renamable $r14d = COPY $edi - ; CHECK-NEXT: TEST8ri renamable $r14b, 1, implicit-def $eflags + ; CHECK-NEXT: renamable $r14 = COPY $rcx + ; CHECK-NEXT: renamable $r15 = COPY $rdx + ; CHECK-NEXT: renamable $ebx = COPY $edi + ; CHECK-NEXT: TEST8ri renamable $bl, 1, implicit-def $eflags ; CHECK-NEXT: JCC_1 %bb.3, 4, implicit killed $eflags ; CHECK-NEXT: JMP_1 %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.left: ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.6(0x00000800) - ; CHECK-NEXT: liveins: $rbp, $rsi, $r14d + ; CHECK-NEXT: liveins: $ebx, $rsi, $r15 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, renamable $rsi :: (store (s64) into %stack.0) ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: $rdi = COPY killed renamable $rsi - ; CHECK-NEXT: renamable $rbp = STATEPOINT 0, 
0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 0, 2, 2, killed renamable $rbp(tied-def 0), 1, 8, %stack.0, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s64) on %stack.0) + ; CHECK-NEXT: renamable $r15 = STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 0, 2, 2, killed renamable $r15(tied-def 0), 1, 8, %stack.0, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s64) on %stack.0) ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: JMP_1 %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.left.relocs: ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: liveins: $rbp, $r14d + ; CHECK-NEXT: liveins: $ebx, $r15 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $rbx = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0) + ; CHECK-NEXT: renamable $r14 = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0) ; CHECK-NEXT: JMP_1 %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.right: ; CHECK-NEXT: successors: %bb.4(0x7ffff800), %bb.7(0x00000800) - ; CHECK-NEXT: liveins: $rbp, $rbx, $rsi, $r14d + ; CHECK-NEXT: liveins: $ebx, $rsi, $r14, $r15 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rbp :: (store (s64) into %stack.0) + ; CHECK-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $r15 :: (store (s64) into %stack.0) ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: $rdi = COPY killed renamable $rsi - ; CHECK-NEXT: renamable $rbx = STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 0, 2, 2, killed renamable $rbx(tied-def 0), 1, 8, %stack.0, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s64) on %stack.0) + ; CHECK-NEXT: renamable $r14 = STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 0, 2, 2, killed renamable $r14(tied-def 0), 1, 8, %stack.0, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (volatile load store (s64) on %stack.0) ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: JMP_1 %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4.right.relocs: ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: liveins: $rbx, $r14d + ; CHECK-NEXT: liveins: $ebx, $r14 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $rbp = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0) + ; CHECK-NEXT: renamable $r15 = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5.normal_return: - ; CHECK-NEXT: liveins: $rbp, $rbx, $r14d + ; CHECK-NEXT: liveins: $ebx, $r14, $r15 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: TEST8ri renamable $r14b, 1, implicit-def $eflags, implicit killed $r14d - ; CHECK-NEXT: renamable $rbx = CMOV64rr killed renamable $rbx, killed renamable $rbp, 4, implicit killed $eflags - ; CHECK-NEXT: $rax = COPY killed renamable $rbx + ; CHECK-NEXT: TEST8ri renamable $bl, 1, implicit-def $eflags, implicit killed $ebx + ; CHECK-NEXT: renamable $r14 = CMOV64rr killed renamable $r14, killed renamable $r15, 4, implicit killed $eflags + ; CHECK-NEXT: $rax = COPY killed renamable $r14 ; CHECK-NEXT: RET 0, $rax ; 
CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6.exceptional_return.left (landing-pad): diff --git a/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll b/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll --- a/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll +++ b/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll @@ -77,13 +77,13 @@ ; CHECK-PREG-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.1) ; CHECK-PREG-NEXT: renamable $rax = MOV64rm %fixed-stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.5, align 16) ; CHECK-PREG-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.0) - ; CHECK-PREG-NEXT: renamable $rbx = MOV64rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.4) - ; CHECK-PREG-NEXT: renamable $r13 = MOV64rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16) - ; CHECK-PREG-NEXT: renamable $r12 = MOV64rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2) - ; CHECK-PREG-NEXT: renamable $r14 = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.1, align 16) - ; CHECK-PREG-NEXT: renamable $r15 = MOV64rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.0) + ; CHECK-PREG-NEXT: renamable $r13 = MOV64rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.4) + ; CHECK-PREG-NEXT: renamable $r12 = MOV64rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16) + ; CHECK-PREG-NEXT: renamable $r15 = MOV64rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2) + ; CHECK-PREG-NEXT: renamable $rbx = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.1, align 16) + ; CHECK-PREG-NEXT: renamable $r14 = MOV64rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.0) ; CHECK-PREG-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp - ; CHECK-PREG-NEXT: renamable $r15, renamable $r14, renamable $r12, renamable $r13, renamable $rbx, renamable $rbp = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, killed renamable $r15(tied-def 0), killed renamable $r14(tied-def 1), killed renamable $r12(tied-def 2), killed renamable $r13(tied-def 3), killed renamable $rbx(tied-def 4), 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 1, 8, %stack.3, 0, 1, 8, %stack.4, 0, 1, 8, %stack.5, 0, 1, 8, %stack.7, 0, 1, 8, %stack.8, 0, 1, 8, %stack.2, 0, 1, 8, %stack.6, 0, 1, 8, %stack.9, 0, 1, 8, %stack.10, 0, 1, 8, %stack.11, 0, killed renamable $rbp(tied-def 5), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store (s64) on %stack.0), (load store (s64) on %stack.1), (load store (s64) on %stack.2), (load store (s64) on %stack.3), (load store (s64) on %stack.4), (load store (s64) on %stack.5), (load store (s64) on %stack.6), (load store (s64) on %stack.7), (load store (s64) on %stack.8), (load store (s64) on %stack.9), (load store (s64) on %stack.10), (load store (s64) on %stack.11) + ; CHECK-PREG-NEXT: renamable $r14, renamable $rbx, renamable $r15, renamable $r12, renamable $r13, renamable $rbp = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, killed renamable $r14(tied-def 0), killed renamable $rbx(tied-def 1), killed renamable $r15(tied-def 
2), killed renamable $r12(tied-def 3), killed renamable $r13(tied-def 4), 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 1, 8, %stack.3, 0, 1, 8, %stack.4, 0, 1, 8, %stack.5, 0, 1, 8, %stack.7, 0, 1, 8, %stack.8, 0, 1, 8, %stack.2, 0, 1, 8, %stack.6, 0, 1, 8, %stack.9, 0, 1, 8, %stack.10, 0, 1, 8, %stack.11, 0, killed renamable $rbp(tied-def 5), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store (s64) on %stack.0), (load store (s64) on %stack.1), (load store (s64) on %stack.2), (load store (s64) on %stack.3), (load store (s64) on %stack.4), (load store (s64) on %stack.5), (load store (s64) on %stack.6), (load store (s64) on %stack.7), (load store (s64) on %stack.8), (load store (s64) on %stack.9), (load store (s64) on %stack.10), (load store (s64) on %stack.11) ; CHECK-PREG-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK-PREG-NEXT: renamable $eax = MOV32rm killed renamable $rbp, 1, $noreg, 4, $noreg :: (load (s32) from %ir.gep00, addrspace 1) ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) @@ -110,11 +110,11 @@ ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1) ; CHECK-PREG-NEXT: renamable $rdi = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0) ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbx, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r13, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r12, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r14, 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1) - ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r15, 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r13, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r12, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r15, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbx, 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1) + ; CHECK-PREG-NEXT: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r14, 1, $noreg, 
72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1) ; CHECK-PREG-NEXT: RET 0, $eax ptr addrspace(1) %arg00, ptr addrspace(1) %arg01, ptr addrspace(1) %arg02, ptr addrspace(1) %arg03, ptr addrspace(1) %arg04, ptr addrspace(1) %arg05, ptr addrspace(1) %arg06, ptr addrspace(1) %arg07, ptr addrspace(1) %arg08, ptr addrspace(1) %arg09, ptr addrspace(1) %arg10, ptr addrspace(1) %arg11, diff --git a/llvm/test/CodeGen/X86/statepoint-vreg.ll b/llvm/test/CodeGen/X86/statepoint-vreg.ll --- a/llvm/test/CodeGen/X86/statepoint-vreg.ll +++ b/llvm/test/CodeGen/X86/statepoint-vreg.ll @@ -62,16 +62,16 @@ ; CHECK-NEXT: .cfi_offset %rbx, -32 ; CHECK-NEXT: .cfi_offset %r14, -24 ; CHECK-NEXT: .cfi_offset %r15, -16 -; CHECK-NEXT: movq %rdx, %r14 -; CHECK-NEXT: movq %rsi, %r15 -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rsi, %r14 +; CHECK-NEXT: movq %rdi, %r15 ; CHECK-NEXT: callq func@PLT ; CHECK-NEXT: .Ltmp1: -; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: movq %r15, %rdi ; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: movq %r15, %rdx +; CHECK-NEXT: movq %r14, %rdx ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movq %r14, %r8 +; CHECK-NEXT: movq %rbx, %r8 ; CHECK-NEXT: callq consume5@PLT ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 24 @@ -251,17 +251,17 @@ ; CHECK-NEXT: .cfi_offset %rbx, -32 ; CHECK-NEXT: .cfi_offset %r14, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %esi, %ebp +; CHECK-NEXT: movl %esi, %r14d ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: callq return_i1@PLT ; CHECK-NEXT: .Ltmp7: -; CHECK-NEXT: testb $1, %bpl +; CHECK-NEXT: testb $1, %r14b ; CHECK-NEXT: je .LBB7_2 ; CHECK-NEXT: # %bb.1: # %left -; CHECK-NEXT: movl %eax, %r14d +; CHECK-NEXT: movl %eax, %ebp ; CHECK-NEXT: movq %rbx, %rdi ; CHECK-NEXT: callq consume@PLT -; CHECK-NEXT: movl %r14d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: jmp .LBB7_3 ; CHECK-NEXT: .LBB7_2: # %right ; CHECK-NEXT: movb $1, %al @@ -353,18 +353,18 @@ ; CHECK-NEXT: .cfi_offset %r12, -32 ; CHECK-NEXT: .cfi_offset %r14, -24 ; CHECK-NEXT: .cfi_offset %r15, -16 -; CHECK-NEXT: movq %r8, %r14 -; CHECK-NEXT: movq %rcx, %r15 -; CHECK-NEXT: movq %rdx, %r12 -; CHECK-NEXT: movq %rsi, %rbx +; CHECK-NEXT: movq %r8, %rbx +; CHECK-NEXT: movq %rcx, %r14 +; CHECK-NEXT: movq %rdx, %r15 +; CHECK-NEXT: movq %rsi, %r12 ; CHECK-NEXT: movq %rdi, (%rsp) ; CHECK-NEXT: callq func@PLT ; CHECK-NEXT: .Ltmp11: ; CHECK-NEXT: movq (%rsp), %rdi -; CHECK-NEXT: movq %rbx, %rsi -; CHECK-NEXT: movq %r12, %rdx -; CHECK-NEXT: movq %r15, %rcx -; CHECK-NEXT: movq %r14, %r8 +; CHECK-NEXT: movq %r12, %rsi +; CHECK-NEXT: movq %r15, %rdx +; CHECK-NEXT: movq %r14, %rcx +; CHECK-NEXT: movq %rbx, %r8 ; CHECK-NEXT: callq consume5@PLT ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 40 diff --git a/llvm/test/CodeGen/X86/statepoint-vreg.mir b/llvm/test/CodeGen/X86/statepoint-vreg.mir --- a/llvm/test/CodeGen/X86/statepoint-vreg.mir +++ b/llvm/test/CodeGen/X86/statepoint-vreg.mir @@ -20,12 +20,12 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 - ; CHECK-NEXT: movq %rsi, %r14 - ; CHECK-NEXT: movq %rdi, %rbx + ; CHECK-NEXT: movq %rsi, %rbx + ; CHECK-NEXT: movq %rdi, %r14 ; CHECK-NEXT: callq bar ; CHECK-NEXT: .Ltmp0: - ; CHECK-NEXT: movl (%rbx), %eax - ; CHECK-NEXT: addl (%r14), %eax + ; CHECK-NEXT: movl (%r14), %eax + ; CHECK-NEXT: addl (%rbx), %eax ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: popq %rbx @@ -83,25 +83,25 @@ 
; CHECK-NEXT: .byte 1 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .short 8 - ; CHECK-NEXT: .short 14 + ; CHECK-NEXT: .short 3 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .long 0 ; CHECK-NEXT: .byte 1 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .short 8 - ; CHECK-NEXT: .short 14 + ; CHECK-NEXT: .short 3 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .long 0 ; CHECK-NEXT: .byte 1 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .short 8 - ; CHECK-NEXT: .short 3 + ; CHECK-NEXT: .short 14 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .long 0 ; CHECK-NEXT: .byte 1 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .short 8 - ; CHECK-NEXT: .short 3 + ; CHECK-NEXT: .short 14 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .long 0 ; CHECK-NEXT: .p2align 3 diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll --- a/llvm/test/CodeGen/X86/subcarry.ll +++ b/llvm/test/CodeGen/X86/subcarry.ll @@ -41,18 +41,18 @@ ; CHECK-LABEL: negate: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: xorl %r8d, %r8d +; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: subq (%rsi), %rdx ; CHECK-NEXT: movl $0, %edi ; CHECK-NEXT: sbbq 8(%rsi), %rdi -; CHECK-NEXT: movl $0, %ecx -; CHECK-NEXT: sbbq 16(%rsi), %rcx -; CHECK-NEXT: sbbq 24(%rsi), %r8 +; CHECK-NEXT: movl $0, %r8d +; CHECK-NEXT: sbbq 16(%rsi), %r8 +; CHECK-NEXT: sbbq 24(%rsi), %rcx ; CHECK-NEXT: movq %rdx, (%rax) ; CHECK-NEXT: movq %rdi, 8(%rax) -; CHECK-NEXT: movq %rcx, 16(%rax) -; CHECK-NEXT: movq %r8, 24(%rax) +; CHECK-NEXT: movq %r8, 16(%rax) +; CHECK-NEXT: movq %rcx, 24(%rax) ; CHECK-NEXT: retq entry: %0 = load i64, ptr %this, align 8 @@ -93,25 +93,25 @@ ; CHECK-LABEL: sub: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq (%rsi), %r10 -; CHECK-NEXT: movq 8(%rsi), %rdi -; CHECK-NEXT: subq %rdx, %r10 +; CHECK-NEXT: movq (%rsi), %rdi +; CHECK-NEXT: movq 8(%rsi), %r10 +; CHECK-NEXT: subq %rdx, %rdi ; CHECK-NEXT: setae %dl ; CHECK-NEXT: addb $-1, %dl -; CHECK-NEXT: adcq $0, %rdi -; CHECK-NEXT: setb %dl -; CHECK-NEXT: movzbl %dl, %r11d -; CHECK-NEXT: notq %rcx -; CHECK-NEXT: addq %rdi, %rcx -; CHECK-NEXT: adcq 16(%rsi), %r11 +; CHECK-NEXT: adcq $0, %r10 ; CHECK-NEXT: setb %dl ; CHECK-NEXT: movzbl %dl, %edx +; CHECK-NEXT: notq %rcx +; CHECK-NEXT: addq %r10, %rcx +; CHECK-NEXT: adcq 16(%rsi), %rdx +; CHECK-NEXT: setb %r10b +; CHECK-NEXT: movzbl %r10b, %r10d ; CHECK-NEXT: notq %r8 -; CHECK-NEXT: addq %r11, %r8 -; CHECK-NEXT: adcq 24(%rsi), %rdx +; CHECK-NEXT: addq %rdx, %r8 +; CHECK-NEXT: adcq 24(%rsi), %r10 ; CHECK-NEXT: notq %r9 -; CHECK-NEXT: addq %rdx, %r9 -; CHECK-NEXT: movq %r10, (%rax) +; CHECK-NEXT: addq %r10, %r9 +; CHECK-NEXT: movq %rdi, (%rax) ; CHECK-NEXT: movq %rcx, 8(%rax) ; CHECK-NEXT: movq %r8, 16(%rax) ; CHECK-NEXT: movq %r9, 24(%rax) @@ -593,21 +593,21 @@ ; CHECK-LABEL: sub_U256_without_i128_or_recursive: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq (%rsi), %r8 -; CHECK-NEXT: movq 8(%rsi), %r9 -; CHECK-NEXT: movq 16(%rsi), %rcx +; CHECK-NEXT: movq (%rsi), %rcx +; CHECK-NEXT: movq 8(%rsi), %rdi +; CHECK-NEXT: movq 16(%rsi), %r8 ; CHECK-NEXT: movq 24(%rsi), %rsi -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: subq 16(%rdx), %rcx -; CHECK-NEXT: setb %dil +; CHECK-NEXT: xorl %r9d, %r9d +; CHECK-NEXT: subq 16(%rdx), %r8 +; CHECK-NEXT: setb %r9b ; CHECK-NEXT: subq 24(%rdx), %rsi -; CHECK-NEXT: subq (%rdx), %r8 -; CHECK-NEXT: sbbq 8(%rdx), %r9 -; CHECK-NEXT: sbbq $0, %rcx -; CHECK-NEXT: sbbq %rdi, %rsi -; CHECK-NEXT: movq %r8, (%rax) -; CHECK-NEXT: movq %r9, 8(%rax) -; CHECK-NEXT: movq %rcx, 16(%rax) +; CHECK-NEXT: 
subq (%rdx), %rcx +; CHECK-NEXT: sbbq 8(%rdx), %rdi +; CHECK-NEXT: sbbq $0, %r8 +; CHECK-NEXT: sbbq %r9, %rsi +; CHECK-NEXT: movq %rcx, (%rax) +; CHECK-NEXT: movq %rdi, 8(%rax) +; CHECK-NEXT: movq %r8, 16(%rax) ; CHECK-NEXT: movq %rsi, 24(%rax) ; CHECK-NEXT: retq %4 = load i64, ptr %1, align 8 diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll --- a/llvm/test/CodeGen/X86/swifterror.ll +++ b/llvm/test/CodeGen/X86/swifterror.ll @@ -530,25 +530,25 @@ define void @foo_sret(ptr sret(%struct.S) %agg.result, i32 %val1, ptr swifterror %error_ptr_ref) { ; CHECK-APPLE-LABEL: foo_sret: ; CHECK-APPLE: ## %bb.0: ## %entry -; CHECK-APPLE-NEXT: pushq %rbp +; CHECK-APPLE-NEXT: pushq %r14 ; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 16 ; CHECK-APPLE-NEXT: pushq %rbx ; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 24 ; CHECK-APPLE-NEXT: pushq %rax ; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 32 ; CHECK-APPLE-NEXT: .cfi_offset %rbx, -24 -; CHECK-APPLE-NEXT: .cfi_offset %rbp, -16 -; CHECK-APPLE-NEXT: movl %esi, %ebp -; CHECK-APPLE-NEXT: movq %rdi, %rbx +; CHECK-APPLE-NEXT: .cfi_offset %r14, -16 +; CHECK-APPLE-NEXT: movl %esi, %ebx +; CHECK-APPLE-NEXT: movq %rdi, %r14 ; CHECK-APPLE-NEXT: movl $16, %edi ; CHECK-APPLE-NEXT: callq _malloc ; CHECK-APPLE-NEXT: movb $1, 8(%rax) -; CHECK-APPLE-NEXT: movl %ebp, 4(%rbx) +; CHECK-APPLE-NEXT: movl %ebx, 4(%r14) ; CHECK-APPLE-NEXT: movq %rax, %r12 -; CHECK-APPLE-NEXT: movq %rbx, %rax +; CHECK-APPLE-NEXT: movq %r14, %rax ; CHECK-APPLE-NEXT: addq $8, %rsp ; CHECK-APPLE-NEXT: popq %rbx -; CHECK-APPLE-NEXT: popq %rbp +; CHECK-APPLE-NEXT: popq %r14 ; CHECK-APPLE-NEXT: retq ; ; CHECK-O0-LABEL: foo_sret: @@ -736,8 +736,8 @@ ; CHECK-APPLE-NEXT: .cfi_offset %rbx, -40 ; CHECK-APPLE-NEXT: .cfi_offset %r12, -32 ; CHECK-APPLE-NEXT: .cfi_offset %r14, -24 -; CHECK-APPLE-NEXT: movq %rsi, %r14 -; CHECK-APPLE-NEXT: movq %rdi, %rbx +; CHECK-APPLE-NEXT: movq %rsi, %rbx +; CHECK-APPLE-NEXT: movq %rdi, %r14 ; CHECK-APPLE-NEXT: xorl %r12d, %r12d ; CHECK-APPLE-NEXT: callq _foo ; CHECK-APPLE-NEXT: movq %r12, %rdi @@ -745,7 +745,7 @@ ; CHECK-APPLE-NEXT: jne LBB7_2 ; CHECK-APPLE-NEXT: ## %bb.1: ## %cont ; CHECK-APPLE-NEXT: movzbl 8(%rdi), %eax -; CHECK-APPLE-NEXT: movb %al, (%rbx) +; CHECK-APPLE-NEXT: movb %al, (%r14) ; CHECK-APPLE-NEXT: LBB7_2: ## %handler ; CHECK-APPLE-NEXT: callq _free ; CHECK-APPLE-NEXT: movq %rsp, %rax @@ -758,7 +758,7 @@ ; CHECK-APPLE-NEXT: jne LBB7_4 ; CHECK-APPLE-NEXT: ## %bb.3: ## %cont2 ; CHECK-APPLE-NEXT: movzbl 8(%rdi), %eax -; CHECK-APPLE-NEXT: movb %al, (%r14) +; CHECK-APPLE-NEXT: movb %al, (%rbx) ; CHECK-APPLE-NEXT: LBB7_4: ## %handler2 ; CHECK-APPLE-NEXT: callq _free ; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -1400,9 +1400,9 @@ ; CHECK-APPLE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-APPLE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-APPLE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-APPLE-NEXT: movq %rcx, %r14 -; CHECK-APPLE-NEXT: movq %rdx, %r15 -; CHECK-APPLE-NEXT: movq %rsi, %rbx +; CHECK-APPLE-NEXT: movq %rcx, %rbx +; CHECK-APPLE-NEXT: movq %rdx, %r14 +; CHECK-APPLE-NEXT: movq %rsi, %r15 ; CHECK-APPLE-NEXT: movq %rdi, %rbp ; CHECK-APPLE-NEXT: movl $1, %edi ; CHECK-APPLE-NEXT: movl $2, %esi @@ -1414,9 +1414,9 @@ ; CHECK-APPLE-NEXT: xorl %r12d, %r12d ; CHECK-APPLE-NEXT: callq _params_in_reg2 ; CHECK-APPLE-NEXT: movq %rbp, %rdi -; CHECK-APPLE-NEXT: movq %rbx, %rsi -; CHECK-APPLE-NEXT: movq %r15, %rdx -; CHECK-APPLE-NEXT: movq %r14, %rcx +; CHECK-APPLE-NEXT: 
movq %r15, %rsi +; CHECK-APPLE-NEXT: movq %r14, %rdx +; CHECK-APPLE-NEXT: movq %rbx, %rcx ; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload ; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload ; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 ## 8-byte Reload @@ -1566,13 +1566,13 @@ ; CHECK-APPLE-NEXT: .cfi_offset %r14, -32 ; CHECK-APPLE-NEXT: .cfi_offset %r15, -24 ; CHECK-APPLE-NEXT: .cfi_offset %rbp, -16 -; CHECK-APPLE-NEXT: movq %r12, %r14 +; CHECK-APPLE-NEXT: movq %r12, %rbx ; CHECK-APPLE-NEXT: movq %r13, (%rsp) ## 8-byte Spill ; CHECK-APPLE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-APPLE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-APPLE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-APPLE-NEXT: movq %rdx, %r15 -; CHECK-APPLE-NEXT: movq %rsi, %rbx +; CHECK-APPLE-NEXT: movq %rdx, %r14 +; CHECK-APPLE-NEXT: movq %rsi, %r15 ; CHECK-APPLE-NEXT: movq %rdi, %rbp ; CHECK-APPLE-NEXT: movl $1, %edi ; CHECK-APPLE-NEXT: movl $2, %esi @@ -1585,18 +1585,18 @@ ; CHECK-APPLE-NEXT: callq _params_in_reg2 ; CHECK-APPLE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-APPLE-NEXT: movq %rbp, %rdi -; CHECK-APPLE-NEXT: movq %rbx, %rsi -; CHECK-APPLE-NEXT: movq %r15, %rdx +; CHECK-APPLE-NEXT: movq %r15, %rsi +; CHECK-APPLE-NEXT: movq %r14, %rdx ; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload ; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload ; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload ; CHECK-APPLE-NEXT: movq (%rsp), %r13 ## 8-byte Reload -; CHECK-APPLE-NEXT: movq %r14, %r12 +; CHECK-APPLE-NEXT: movq %rbx, %r12 ; CHECK-APPLE-NEXT: callq _params_and_return_in_reg2 -; CHECK-APPLE-NEXT: movq %rax, %rbx -; CHECK-APPLE-NEXT: movq %rdx, %rbp -; CHECK-APPLE-NEXT: movq %rcx, %r15 -; CHECK-APPLE-NEXT: movq %r8, %r14 +; CHECK-APPLE-NEXT: movq %rax, %r14 +; CHECK-APPLE-NEXT: movq %rdx, %r15 +; CHECK-APPLE-NEXT: movq %rcx, %rbp +; CHECK-APPLE-NEXT: movq %r8, %rbx ; CHECK-APPLE-NEXT: movq %r12, (%rsp) ## 8-byte Spill ; CHECK-APPLE-NEXT: movl $1, %edi ; CHECK-APPLE-NEXT: movl $2, %esi @@ -1607,10 +1607,10 @@ ; CHECK-APPLE-NEXT: xorl %r13d, %r13d ; CHECK-APPLE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload ; CHECK-APPLE-NEXT: callq _params_in_reg2 -; CHECK-APPLE-NEXT: movq %rbx, %rax -; CHECK-APPLE-NEXT: movq %rbp, %rdx -; CHECK-APPLE-NEXT: movq %r15, %rcx -; CHECK-APPLE-NEXT: movq %r14, %r8 +; CHECK-APPLE-NEXT: movq %r14, %rax +; CHECK-APPLE-NEXT: movq %r15, %rdx +; CHECK-APPLE-NEXT: movq %rbp, %rcx +; CHECK-APPLE-NEXT: movq %rbx, %r8 ; CHECK-APPLE-NEXT: movq (%rsp), %r12 ## 8-byte Reload ; CHECK-APPLE-NEXT: addq $48, %rsp ; CHECK-APPLE-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll --- a/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll +++ b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll @@ -100,32 +100,32 @@ ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB1_27 ; CHECK-NEXT: # %bb.1: # %if.end19 -; CHECK-NEXT: movl %esi, %r13d -; CHECK-NEXT: movq %rdi, %r12 -; CHECK-NEXT: movl (%rax), %ebp -; CHECK-NEXT: leal (,%rbp,4), %r14d -; CHECK-NEXT: movl %r14d, %r15d +; CHECK-NEXT: movl %esi, %ebp +; CHECK-NEXT: movq %rdi, %r15 +; CHECK-NEXT: movl (%rax), %r13d +; CHECK-NEXT: leal (,%r13,4), %ebx +; CHECK-NEXT: movl %ebx, %r12d ; CHECK-NEXT: movl $1, %esi -; CHECK-NEXT: movq %r15, %rdi +; CHECK-NEXT: movq %r12, %rdi ; 
CHECK-NEXT: callq cli_calloc@PLT -; CHECK-NEXT: testl %r13d, %r13d +; CHECK-NEXT: testl %ebp, %ebp ; CHECK-NEXT: je .LBB1_26 ; CHECK-NEXT: # %bb.2: # %if.end19 -; CHECK-NEXT: testl %ebp, %ebp +; CHECK-NEXT: testl %r13d, %r13d ; CHECK-NEXT: je .LBB1_26 ; CHECK-NEXT: # %bb.3: # %if.end19 -; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: movq %rax, %r14 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB1_26 ; CHECK-NEXT: # %bb.4: # %if.end19 -; CHECK-NEXT: cmpq %r12, %rbx +; CHECK-NEXT: cmpq %r15, %r14 ; CHECK-NEXT: jb .LBB1_26 ; CHECK-NEXT: # %bb.5: # %if.end50 -; CHECK-NEXT: movq %rbx, %rdi -; CHECK-NEXT: movq %r15, %rdx +; CHECK-NEXT: movq %r14, %rdi +; CHECK-NEXT: movq %r12, %rdx ; CHECK-NEXT: callq memcpy@PLT -; CHECK-NEXT: cmpl $4, %r14d +; CHECK-NEXT: cmpl $4, %ebx ; CHECK-NEXT: jb .LBB1_29 ; CHECK-NEXT: # %bb.6: # %shared_preheader ; CHECK-NEXT: movb $32, %dl @@ -146,13 +146,13 @@ ; CHECK-NEXT: .LBB1_9: # %outer_loop_header ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB1_10 Depth 2 -; CHECK-NEXT: testl %ebp, %ebp +; CHECK-NEXT: testl %r13d, %r13d ; CHECK-NEXT: je .LBB1_19 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_10: # %shared_loop_header ; CHECK-NEXT: # Parent Loop BB1_9 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-NEXT: testq %rbx, %rbx +; CHECK-NEXT: testq %r14, %r14 ; CHECK-NEXT: jne .LBB1_28 ; CHECK-NEXT: # %bb.11: # %inner_loop_body ; CHECK-NEXT: # in Loop: Header=BB1_10 Depth=2 @@ -160,12 +160,12 @@ ; CHECK-NEXT: jns .LBB1_10 ; CHECK-NEXT: # %bb.12: # %if.end96.i ; CHECK-NEXT: # in Loop: Header=BB1_9 Depth=1 -; CHECK-NEXT: cmpl $3, %ebp +; CHECK-NEXT: cmpl $3, %r13d ; CHECK-NEXT: jae .LBB1_23 ; CHECK-NEXT: # %bb.13: # %if.end287.i ; CHECK-NEXT: # in Loop: Header=BB1_9 Depth=1 ; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: cmpl $1, %ebp +; CHECK-NEXT: cmpl $1, %r13d ; CHECK-NEXT: setne %dl ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB1_17 diff --git a/llvm/test/CodeGen/X86/tail-opts.ll b/llvm/test/CodeGen/X86/tail-opts.ll --- a/llvm/test/CodeGen/X86/tail-opts.ll +++ b/llvm/test/CodeGen/X86/tail-opts.ll @@ -95,22 +95,22 @@ define dso_local void @tail_duplicate_me() nounwind { ; CHECK-LABEL: tail_duplicate_me: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: callq qux@PLT ; CHECK-NEXT: movl $.Ltmp0, %edi ; CHECK-NEXT: movl $.Ltmp1, %esi -; CHECK-NEXT: movl %eax, %ebx +; CHECK-NEXT: movl %eax, %ebp ; CHECK-NEXT: callq choose@PLT -; CHECK-NEXT: movq %rax, %r14 -; CHECK-NEXT: testb $1, %bl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: testb $1, %bpl ; CHECK-NEXT: je .LBB1_1 ; CHECK-NEXT: # %bb.7: # %A ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: callq bar ; CHECK-NEXT: movl $0, GHJK(%rip) -; CHECK-NEXT: jmpq *%r14 +; CHECK-NEXT: jmpq *%rbx ; CHECK-NEXT: .Ltmp0: # Block address taken ; CHECK-NEXT: .LBB1_4: # %return ; CHECK-NEXT: movl $1000, %edi # imm = 0x3E8 @@ -124,7 +124,7 @@ ; CHECK-NEXT: movl $1, %edi ; CHECK-NEXT: callq car ; CHECK-NEXT: movl $0, GHJK(%rip) -; CHECK-NEXT: jmpq *%r14 +; CHECK-NEXT: jmpq *%rbx ; CHECK-NEXT: .Ltmp1: # Block address taken ; CHECK-NEXT: .LBB1_6: # %altret ; CHECK-NEXT: movl $1001, %edi # imm = 0x3E9 @@ -132,13 +132,13 @@ ; CHECK-NEXT: .LBB1_5: # %return ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB1_3: # %C ; CHECK-NEXT: movl $2, %edi ; CHECK-NEXT: callq dar ; CHECK-NEXT: movl 
$0, GHJK(%rip) -; CHECK-NEXT: jmpq *%r14 +; CHECK-NEXT: jmpq *%rbx entry: %a = call i1 @qux() %c = call ptr @choose(ptr blockaddress(@tail_duplicate_me, %return), diff --git a/llvm/test/CodeGen/X86/tailcallstack64.ll b/llvm/test/CodeGen/X86/tailcallstack64.ll --- a/llvm/test/CodeGen/X86/tailcallstack64.ll +++ b/llvm/test/CodeGen/X86/tailcallstack64.ll @@ -6,7 +6,7 @@ ; Check that lowered arguments on the stack do not overwrite each other. ; Add %in1 %p1 to a different temporary register (%eax). -; CHECK: movl [[A1:32|144]](%rsp), [[R1:%e..]] +; CHECK: movl [[A1:32|144]](%rsp), [[R1:%e..|%r.*d]] ; Move param %in1 to temp register (%r10d). ; CHECK: movl [[A2:40|152]](%rsp), [[R2:%[a-z0-9]+]] ; Add %in1 %p1 to a different temporary register (%eax). diff --git a/llvm/test/CodeGen/X86/tailccstack64.ll b/llvm/test/CodeGen/X86/tailccstack64.ll --- a/llvm/test/CodeGen/X86/tailccstack64.ll +++ b/llvm/test/CodeGen/X86/tailccstack64.ll @@ -6,7 +6,7 @@ ; Check that lowered arguments on the stack do not overwrite each other. ; Add %in1 %p1 to a different temporary register (%eax). -; CHECK: movl [[A1:32|144]](%rsp), [[R1:%e..]] +; CHECK: movl [[A1:32|144]](%rsp), [[R1:%e..|%r.*d]] ; Move param %in1 to temp register (%r10d). ; CHECK: movl [[A2:40|152]](%rsp), [[R2:%[a-z0-9]+]] ; Add %in1 %p1 to a different temporary register (%eax). diff --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll --- a/llvm/test/CodeGen/X86/twoaddr-lea.ll +++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll @@ -63,7 +63,7 @@ define void @ham() { ; CHECK-LABEL: ham: ; CHECK: ## %bb.0: ## %bb -; CHECK-NEXT: xorl %r8d, %r8d +; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movq _global@GOTPCREL(%rip), %rdx ; CHECK-NEXT: movq _global2@GOTPCREL(%rip), %rsi ; CHECK-NEXT: xorl %eax, %eax @@ -74,16 +74,16 @@ ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB3_7 Depth 2 ; CHECK-NEXT: movl (%rdx), %edi -; CHECK-NEXT: leal (%rdi,%rax), %ecx -; CHECK-NEXT: movslq %ecx, %rcx +; CHECK-NEXT: leal (%rdi,%rax), %r8d +; CHECK-NEXT: movslq %r8d, %r8 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB3_7: ## %bb6 ; CHECK-NEXT: ## Parent Loop BB3_6 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 ; CHECK-NEXT: movq %rax, (%rsi) -; CHECK-NEXT: movq %rcx, (%rsi) +; CHECK-NEXT: movq %r8, (%rsi) ; CHECK-NEXT: movl %edi, (%rdx) -; CHECK-NEXT: testb %r8b, %r8b +; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: jne LBB3_7 ; CHECK-NEXT: ## %bb.8: ## %bb9 ; CHECK-NEXT: ## in Loop: Header=BB3_6 Depth=1 diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -1031,25 +1031,25 @@ ; SSE-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE-NEXT: pand %xmm11, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] ; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pxor %xmm8, %xmm9 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pxor %xmm8, %xmm4 ; SSE-NEXT: paddq %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pxor %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] -; SSE-NEXT: 
pcmpeqd %xmm9, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] ; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pxor %xmm8, %xmm4 ; SSE-NEXT: paddq %xmm6, %xmm2 diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -310,16 +310,16 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-LABEL: vec: ; X64: # %bb.0: -; X64-NEXT: pxor %xmm9, %xmm9 +; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: pxor %xmm3, %xmm3 ; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; X64-NEXT: movq %xmm3, %rax ; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; X64-NEXT: movq %xmm4, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm6 +; X64-NEXT: movq %rax, %xmm8 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; X64-NEXT: movq %xmm3, %rax ; X64-NEXT: movdqa %xmm1, %xmm3 @@ -328,50 +328,50 @@ ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm3 -; X64-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; X64-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; X64-NEXT: movdqa %xmm6, %xmm3 -; X64-NEXT: pxor %xmm10, %xmm3 +; X64-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm3[0] +; X64-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; X64-NEXT: movdqa %xmm8, %xmm3 +; X64-NEXT: pxor %xmm4, %xmm3 ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; X64-NEXT: movdqa {{.*#+}} xmm8 = [2147483649,2147483649,2147483649,2147483649] -; X64-NEXT: pcmpeqd %xmm8, %xmm7 -; X64-NEXT: movdqa {{.*#+}} xmm2 = [9223372043297226751,9223372043297226751] -; X64-NEXT: movdqa %xmm2, %xmm5 -; X64-NEXT: pcmpgtd %xmm3, %xmm5 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] -; X64-NEXT: pand %xmm7, %xmm4 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; X64-NEXT: por %xmm4, %xmm3 +; X64-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649,2147483649,2147483649] +; X64-NEXT: pcmpeqd %xmm6, %xmm7 +; X64-NEXT: movdqa {{.*#+}} xmm5 = [9223372043297226751,9223372043297226751] +; X64-NEXT: movdqa %xmm5, %xmm9 +; X64-NEXT: pcmpgtd %xmm3, %xmm9 +; X64-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; X64-NEXT: pand %xmm7, %xmm10 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] +; X64-NEXT: por %xmm10, %xmm3 ; X64-NEXT: movdqa {{.*#+}} xmm7 = [8589934591,8589934591] -; X64-NEXT: pand %xmm3, %xmm6 +; X64-NEXT: pand %xmm3, %xmm8 ; X64-NEXT: pandn %xmm7, %xmm3 -; X64-NEXT: por %xmm6, %xmm3 +; X64-NEXT: por %xmm8, %xmm3 ; X64-NEXT: psrlq $1, %xmm3 -; X64-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; X64-NEXT: movq %xmm9, %rax +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: movq %xmm2, %rax ; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm4 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; X64-NEXT: movq %rax, %xmm8 
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: psrlq $32, %xmm1 ; X64-NEXT: movq %xmm1, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm0 -; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; X64-NEXT: pxor %xmm4, %xmm10 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] -; X64-NEXT: pcmpeqd %xmm8, %xmm0 -; X64-NEXT: pcmpgtd %xmm10, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,2] +; X64-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; X64-NEXT: pxor %xmm8, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; X64-NEXT: pcmpeqd %xmm6, %xmm0 +; X64-NEXT: pcmpgtd %xmm4, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] ; X64-NEXT: pand %xmm0, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: pand %xmm0, %xmm4 +; X64-NEXT: pand %xmm0, %xmm8 ; X64-NEXT: pandn %xmm7, %xmm0 -; X64-NEXT: por %xmm4, %xmm0 +; X64-NEXT: por %xmm8, %xmm0 ; X64-NEXT: psrlq $1, %xmm0 ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll --- a/llvm/test/CodeGen/X86/umul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll @@ -37,7 +37,7 @@ ; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rsi), %eax -; X64-NEXT: addl %eax, %eax +; X64-NEXT: addl %eax, %eax ; X64-NEXT: retq entry: %tmp0 = add i32 %b, %a @@ -530,93 +530,91 @@ ; X64-NEXT: pushq %rbx ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r8, %r11 -; X64-NEXT: movq %rcx, %r10 -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r14, %rdi -; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %r8, %rbp +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %rbx, %r15 +; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rdi, %r9 -; X64-NEXT: adcq %rbx, %rcx +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r15, %rbx +; X64-NEXT: adcq %r14, %r12 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %edi -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: adcq %rdi, %rbx +; X64-NEXT: movzbl %al, %r8d ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r12, %r13 +; X64-NEXT: adcq %r8, %r15 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r10, %rax -; 
X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r14, %rdi -; X64-NEXT: adcq %r8, %rdx -; X64-NEXT: imulq %rcx, %r11 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; X64-NEXT: addq %rbp, %r15 -; X64-NEXT: adcq %rbx, %rdi -; X64-NEXT: adcq %rdx, %r11 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r12, %r8 ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %r8, %r12 ; X64-NEXT: adcq %rbp, %rdx -; X64-NEXT: imulq %rcx, %r13 -; X64-NEXT: addq %rdx, %r13 -; X64-NEXT: addq %r15, %r8 -; X64-NEXT: adcq %rdi, %rax -; X64-NEXT: adcq %r11, %r13 -; X64-NEXT: imulq %r14, %r10 -; X64-NEXT: addq %r13, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rcx +; X64-NEXT: imulq %r9, %r11 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; X64-NEXT: addq %r13, %r14 +; X64-NEXT: adcq %r15, %r12 +; X64-NEXT: adcq %rdx, %r11 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r8, %rbp +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: addq %rbp, %rax +; X64-NEXT: adcq %r13, %rdx +; X64-NEXT: imulq %r8, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: addq %r14, %r15 +; X64-NEXT: adcq %r12, %rax +; X64-NEXT: adcq %r11, %r10 +; X64-NEXT: imulq %r9, %rcx +; X64-NEXT: addq %r10, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rdx ; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rsi +; X64-NEXT: addq %rdx, %rsi ; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: addq %r10, %rsi -; X64-NEXT: movq %r9, 8(%r12) +; X64-NEXT: movq %rbx, 8(%rdi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, (%r12) -; X64-NEXT: movq %r8, 16(%r12) -; X64-NEXT: movq %rax, 24(%r12) -; X64-NEXT: movl %esi, 32(%r12) +; X64-NEXT: movq %rcx, (%rdi) +; X64-NEXT: movq %r15, 16(%rdi) +; X64-NEXT: movq %rax, 24(%rdi) +; X64-NEXT: movl %esi, 32(%rdi) ; X64-NEXT: shrq $32, %rsi ; X64-NEXT: andl $4095, %esi # imm = 0xFFF -; X64-NEXT: movw %si, 36(%r12) -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movw %si, 36(%rdi) +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -312,41 +312,41 @@ ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d ; 
CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: xorb %bl, %sil +; CHECK-BASELINE-NEXT: xorb %r12b, %sil ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: xorb %bl, %sil -; CHECK-BASELINE-NEXT: xorb %r12b, %dl +; CHECK-BASELINE-NEXT: xorb %r12b, %sil +; CHECK-BASELINE-NEXT: xorb %r15b, %dl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl -; CHECK-BASELINE-NEXT: xorb %r12b, %dl -; CHECK-BASELINE-NEXT: xorb %r15b, %cl +; CHECK-BASELINE-NEXT: xorb %r15b, %dl +; CHECK-BASELINE-NEXT: xorb %r14b, %cl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: xorb %r15b, %cl -; CHECK-BASELINE-NEXT: xorb %r14b, %r8b +; CHECK-BASELINE-NEXT: xorb %r14b, %cl +; CHECK-BASELINE-NEXT: xorb %bpl, %r8b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b -; CHECK-BASELINE-NEXT: xorb %r14b, %r8b -; CHECK-BASELINE-NEXT: xorb %bpl, %r9b +; CHECK-BASELINE-NEXT: xorb %bpl, %r8b +; CHECK-BASELINE-NEXT: xorb %bl, %r9b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b -; CHECK-BASELINE-NEXT: xorb %bpl, %r9b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: xorb %r11b, %bpl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-BASELINE-NEXT: xorb %r11b, %bpl +; CHECK-BASELINE-NEXT: xorb %bl, %r9b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: xorb %r11b, %bl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: xorb %r11b, %bl ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-BASELINE-NEXT: xorb %r10b, %r11b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b ; CHECK-BASELINE-NEXT: xorb %r10b, %r11b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: xorb %dil, %bl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: xorb %dil, %bl -; CHECK-BASELINE-NEXT: movb %bl, 7(%rax) +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: xorb %dil, %r10b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b +; CHECK-BASELINE-NEXT: xorb %dil, %r10b +; CHECK-BASELINE-NEXT: movb %r10b, 7(%rax) ; CHECK-BASELINE-NEXT: movb %r11b, 6(%rax) -; CHECK-BASELINE-NEXT: movb %bpl, 5(%rax) +; CHECK-BASELINE-NEXT: movb %bl, 5(%rax) ; CHECK-BASELINE-NEXT: movb %r9b, 4(%rax) ; CHECK-BASELINE-NEXT: movb %r8b, 3(%rax) ; CHECK-BASELINE-NEXT: movb %cl, 2(%rax) @@ -370,41 +370,41 @@ ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: xorb %bl, %sil +; CHECK-SSE1-NEXT: xorb %r12b, %sil ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: xorb %bl, %sil -; CHECK-SSE1-NEXT: xorb %r12b, %dl +; CHECK-SSE1-NEXT: xorb %r12b, %sil +; CHECK-SSE1-NEXT: xorb %r15b, %dl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl -; CHECK-SSE1-NEXT: xorb %r12b, %dl -; CHECK-SSE1-NEXT: xorb %r15b, %cl +; CHECK-SSE1-NEXT: xorb %r15b, %dl +; CHECK-SSE1-NEXT: xorb %r14b, %cl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: xorb %r15b, %cl -; CHECK-SSE1-NEXT: xorb %r14b, %r8b +; CHECK-SSE1-NEXT: xorb 
%r14b, %cl +; CHECK-SSE1-NEXT: xorb %bpl, %r8b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b -; CHECK-SSE1-NEXT: xorb %r14b, %r8b -; CHECK-SSE1-NEXT: xorb %bpl, %r9b +; CHECK-SSE1-NEXT: xorb %bpl, %r8b +; CHECK-SSE1-NEXT: xorb %bl, %r9b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b -; CHECK-SSE1-NEXT: xorb %bpl, %r9b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: xorb %r11b, %bpl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-SSE1-NEXT: xorb %r11b, %bpl +; CHECK-SSE1-NEXT: xorb %bl, %r9b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: xorb %r11b, %bl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: xorb %r11b, %bl ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-SSE1-NEXT: xorb %r10b, %r11b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b ; CHECK-SSE1-NEXT: xorb %r10b, %r11b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: xorb %dil, %bl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: xorb %dil, %bl -; CHECK-SSE1-NEXT: movb %bl, 7(%rax) +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: xorb %dil, %r10b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b +; CHECK-SSE1-NEXT: xorb %dil, %r10b +; CHECK-SSE1-NEXT: movb %r10b, 7(%rax) ; CHECK-SSE1-NEXT: movb %r11b, 6(%rax) -; CHECK-SSE1-NEXT: movb %bpl, 5(%rax) +; CHECK-SSE1-NEXT: movb %bl, 5(%rax) ; CHECK-SSE1-NEXT: movb %r9b, 4(%rax) ; CHECK-SSE1-NEXT: movb %r8b, 3(%rax) ; CHECK-SSE1-NEXT: movb %cl, 2(%rax) @@ -439,18 +439,18 @@ ; CHECK-BASELINE-LABEL: out_v4i16: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: xorl %edi, %edx +; CHECK-BASELINE-NEXT: xorl %r11d, %edx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-BASELINE-NEXT: xorl %edi, %edx -; CHECK-BASELINE-NEXT: xorl %r11d, %ecx +; CHECK-BASELINE-NEXT: xorl %r11d, %edx +; CHECK-BASELINE-NEXT: xorl %r10d, %ecx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: xorl %r11d, %ecx -; CHECK-BASELINE-NEXT: xorl %r10d, %r8d +; CHECK-BASELINE-NEXT: xorl %r10d, %ecx +; CHECK-BASELINE-NEXT: xorl %edi, %r8d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-BASELINE-NEXT: xorl %r10d, %r8d +; CHECK-BASELINE-NEXT: xorl %edi, %r8d ; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-BASELINE-NEXT: xorl %r9d, %esi @@ -463,18 +463,18 @@ ; CHECK-SSE1-LABEL: out_v4i16: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: xorl %edi, %edx +; CHECK-SSE1-NEXT: xorl %r11d, %edx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-SSE1-NEXT: xorl %edi, %edx -; CHECK-SSE1-NEXT: xorl %r11d, %ecx +; CHECK-SSE1-NEXT: xorl %r11d, %edx +; CHECK-SSE1-NEXT: xorl %r10d, %ecx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: xorl %r11d, %ecx -; CHECK-SSE1-NEXT: xorl %r10d, %r8d +; CHECK-SSE1-NEXT: xorl %r10d, %ecx +; CHECK-SSE1-NEXT: xorl %edi, %r8d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-SSE1-NEXT: xorl %r10d, %r8d +; CHECK-SSE1-NEXT: xorl %edi, %r8d ; CHECK-SSE1-NEXT: xorl %r9d, %esi ; 
CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-SSE1-NEXT: xorl %r9d, %esi @@ -506,15 +506,15 @@ ; CHECK-BASELINE-LABEL: out_v4i16_undef: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: xorl %edi, %edx +; CHECK-BASELINE-NEXT: xorl %r10d, %edx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-BASELINE-NEXT: xorl %edi, %edx -; CHECK-BASELINE-NEXT: xorl %r10d, %r8d +; CHECK-BASELINE-NEXT: xorl %r10d, %edx +; CHECK-BASELINE-NEXT: xorl %edi, %r8d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-BASELINE-NEXT: xorl %r10d, %r8d +; CHECK-BASELINE-NEXT: xorl %edi, %r8d ; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-BASELINE-NEXT: xorl %r9d, %esi @@ -527,15 +527,15 @@ ; CHECK-SSE1-LABEL: out_v4i16_undef: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: xorl %edi, %edx +; CHECK-SSE1-NEXT: xorl %r10d, %edx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-SSE1-NEXT: xorl %edi, %edx -; CHECK-SSE1-NEXT: xorl %r10d, %r8d +; CHECK-SSE1-NEXT: xorl %r10d, %edx +; CHECK-SSE1-NEXT: xorl %edi, %r8d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-SSE1-NEXT: xorl %r10d, %r8d +; CHECK-SSE1-NEXT: xorl %edi, %r8d ; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-SSE1-NEXT: xorl %r9d, %esi @@ -637,16 +637,16 @@ ; CHECK-BASELINE-NEXT: movl %edx, %r11d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edx -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: xorb %bl, %sil +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: xorb %r10b, %sil ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: xorb %bl, %sil +; CHECK-BASELINE-NEXT: xorb %r10b, %sil ; CHECK-BASELINE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: xorb %dl, %r11b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b @@ -655,21 +655,21 @@ ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: xorb %r10b, %r8b +; CHECK-BASELINE-NEXT: xorb %bl, %r8b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b -; CHECK-BASELINE-NEXT: xorb %r10b, %r8b +; CHECK-BASELINE-NEXT: xorb %bl, %r8b ; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: xorb %r12b, %r9b +; CHECK-BASELINE-NEXT: xorb %r14b, %r9b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b -; CHECK-BASELINE-NEXT: xorb %r12b, %r9b +; CHECK-BASELINE-NEXT: xorb %r14b, 
%r9b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; CHECK-BASELINE-NEXT: xorb %r12b, %r14b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b +; CHECK-BASELINE-NEXT: xorb %r12b, %r14b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-BASELINE-NEXT: xorb %bpl, %r12b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r12b ; CHECK-BASELINE-NEXT: xorb %bpl, %r12b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: xorb %r14b, %bpl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-BASELINE-NEXT: xorb %r14b, %bpl ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %esi ; CHECK-BASELINE-NEXT: xorb %r15b, %sil ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil @@ -693,11 +693,11 @@ ; CHECK-BASELINE-NEXT: xorb %al, %r15b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b ; CHECK-BASELINE-NEXT: xorb %al, %r15b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b -; CHECK-BASELINE-NEXT: xorb %al, %r14b +; CHECK-BASELINE-NEXT: xorb %al, %bpl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl +; CHECK-BASELINE-NEXT: xorb %al, %bpl ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-BASELINE-NEXT: xorb %al, %bl @@ -716,14 +716,14 @@ ; CHECK-BASELINE-NEXT: movb %r10b, 15(%rdi) ; CHECK-BASELINE-NEXT: movb %al, 14(%rdi) ; CHECK-BASELINE-NEXT: movb %bl, 13(%rdi) -; CHECK-BASELINE-NEXT: movb %r14b, 12(%rdi) +; CHECK-BASELINE-NEXT: movb %bpl, 12(%rdi) ; CHECK-BASELINE-NEXT: movb %r15b, 11(%rdi) ; CHECK-BASELINE-NEXT: movb %r13b, 10(%rdi) ; CHECK-BASELINE-NEXT: movb %cl, 9(%rdi) ; CHECK-BASELINE-NEXT: movb %dl, 8(%rdi) ; CHECK-BASELINE-NEXT: movb %sil, 7(%rdi) -; CHECK-BASELINE-NEXT: movb %bpl, 6(%rdi) -; CHECK-BASELINE-NEXT: movb %r12b, 5(%rdi) +; CHECK-BASELINE-NEXT: movb %r12b, 6(%rdi) +; CHECK-BASELINE-NEXT: movb %r14b, 5(%rdi) ; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-BASELINE-NEXT: movb %al, 3(%rdi) @@ -752,16 +752,16 @@ ; CHECK-SSE1-NEXT: movl %edx, %r11d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: xorb %bl, %sil +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: xorb %r10b, %sil ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: xorb %bl, %sil +; CHECK-SSE1-NEXT: xorb %r10b, %sil ; CHECK-SSE1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: xorb %dl, %r11b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b @@ -770,21 +770,21 @@ ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: xorb %r10b, %r8b +; CHECK-SSE1-NEXT: xorb %bl, %r8b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b -; CHECK-SSE1-NEXT: xorb %r10b, %r8b +; 
CHECK-SSE1-NEXT: xorb %bl, %r8b ; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: xorb %r12b, %r9b +; CHECK-SSE1-NEXT: xorb %r14b, %r9b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b -; CHECK-SSE1-NEXT: xorb %r12b, %r9b +; CHECK-SSE1-NEXT: xorb %r14b, %r9b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; CHECK-SSE1-NEXT: xorb %r12b, %r14b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b +; CHECK-SSE1-NEXT: xorb %r12b, %r14b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-SSE1-NEXT: xorb %bpl, %r12b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r12b ; CHECK-SSE1-NEXT: xorb %bpl, %r12b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: xorb %r14b, %bpl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-SSE1-NEXT: xorb %r14b, %bpl ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi ; CHECK-SSE1-NEXT: xorb %r15b, %sil ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil @@ -808,11 +808,11 @@ ; CHECK-SSE1-NEXT: xorb %al, %r15b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b ; CHECK-SSE1-NEXT: xorb %al, %r15b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: xorb %al, %r14b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b -; CHECK-SSE1-NEXT: xorb %al, %r14b +; CHECK-SSE1-NEXT: xorb %al, %bpl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl +; CHECK-SSE1-NEXT: xorb %al, %bpl ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-SSE1-NEXT: xorb %al, %bl @@ -831,14 +831,14 @@ ; CHECK-SSE1-NEXT: movb %r10b, 15(%rdi) ; CHECK-SSE1-NEXT: movb %al, 14(%rdi) ; CHECK-SSE1-NEXT: movb %bl, 13(%rdi) -; CHECK-SSE1-NEXT: movb %r14b, 12(%rdi) +; CHECK-SSE1-NEXT: movb %bpl, 12(%rdi) ; CHECK-SSE1-NEXT: movb %r15b, 11(%rdi) ; CHECK-SSE1-NEXT: movb %r13b, 10(%rdi) ; CHECK-SSE1-NEXT: movb %cl, 9(%rdi) ; CHECK-SSE1-NEXT: movb %dl, 8(%rdi) ; CHECK-SSE1-NEXT: movb %sil, 7(%rdi) -; CHECK-SSE1-NEXT: movb %bpl, 6(%rdi) -; CHECK-SSE1-NEXT: movb %r12b, 5(%rdi) +; CHECK-SSE1-NEXT: movb %r12b, 6(%rdi) +; CHECK-SSE1-NEXT: movb %r14b, 5(%rdi) ; CHECK-SSE1-NEXT: movb %r9b, 4(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-SSE1-NEXT: movb %al, 3(%rdi) @@ -883,44 +883,44 @@ ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: xorl %ebp, %esi +; CHECK-BASELINE-NEXT: xorl %r12d, %esi ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-BASELINE-NEXT: xorl %ebp, %esi -; CHECK-BASELINE-NEXT: xorl %ebx, %edx +; CHECK-BASELINE-NEXT: xorl %r12d, %esi +; CHECK-BASELINE-NEXT: xorl %r15d, %edx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-BASELINE-NEXT: xorl %ebx, %edx -; CHECK-BASELINE-NEXT: xorl %edi, %ecx +; CHECK-BASELINE-NEXT: xorl %r15d, %edx +; CHECK-BASELINE-NEXT: xorl %r14d, %ecx ; 
CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: xorl %edi, %ecx -; CHECK-BASELINE-NEXT: xorl %r12d, %r8d +; CHECK-BASELINE-NEXT: xorl %r14d, %ecx +; CHECK-BASELINE-NEXT: xorl %ebp, %r8d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-BASELINE-NEXT: xorl %r12d, %r8d -; CHECK-BASELINE-NEXT: xorl %r15d, %r9d +; CHECK-BASELINE-NEXT: xorl %ebp, %r8d +; CHECK-BASELINE-NEXT: xorl %ebx, %r9d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w -; CHECK-BASELINE-NEXT: xorl %r15d, %r9d -; CHECK-BASELINE-NEXT: movl %r14d, %edi -; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %di -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di -; CHECK-BASELINE-NEXT: xorl %r14d, %edi +; CHECK-BASELINE-NEXT: xorl %ebx, %r9d ; CHECK-BASELINE-NEXT: movl %r11d, %ebx ; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %bx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx ; CHECK-BASELINE-NEXT: xorl %r11d, %ebx -; CHECK-BASELINE-NEXT: movl %r10d, %ebp -; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %bp -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bp -; CHECK-BASELINE-NEXT: xorl %r10d, %ebp -; CHECK-BASELINE-NEXT: movw %bp, 14(%rax) -; CHECK-BASELINE-NEXT: movw %bx, 12(%rax) -; CHECK-BASELINE-NEXT: movw %di, 10(%rax) +; CHECK-BASELINE-NEXT: movl %r10d, %r11d +; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r11w +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-BASELINE-NEXT: xorl %r10d, %r11d +; CHECK-BASELINE-NEXT: movl %edi, %r10d +; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: xorl %edi, %r10d +; CHECK-BASELINE-NEXT: movw %r10w, 14(%rax) +; CHECK-BASELINE-NEXT: movw %r11w, 12(%rax) +; CHECK-BASELINE-NEXT: movw %bx, 10(%rax) ; CHECK-BASELINE-NEXT: movw %r9w, 8(%rax) ; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) ; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) @@ -941,44 +941,44 @@ ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: xorl %ebp, %esi +; CHECK-SSE1-NEXT: xorl %r12d, %esi ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-SSE1-NEXT: xorl %ebp, %esi -; CHECK-SSE1-NEXT: xorl %ebx, %edx +; CHECK-SSE1-NEXT: xorl %r12d, %esi +; CHECK-SSE1-NEXT: xorl %r15d, %edx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-SSE1-NEXT: xorl %ebx, %edx -; CHECK-SSE1-NEXT: xorl %edi, %ecx +; CHECK-SSE1-NEXT: xorl %r15d, %edx +; CHECK-SSE1-NEXT: xorl %r14d, %ecx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: xorl %edi, %ecx -; CHECK-SSE1-NEXT: xorl %r12d, %r8d +; CHECK-SSE1-NEXT: xorl %r14d, %ecx +; CHECK-SSE1-NEXT: xorl %ebp, %r8d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-SSE1-NEXT: xorl %r12d, %r8d -; CHECK-SSE1-NEXT: xorl %r15d, %r9d +; CHECK-SSE1-NEXT: xorl %ebp, %r8d +; CHECK-SSE1-NEXT: xorl %ebx, %r9d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w -; CHECK-SSE1-NEXT: xorl %r15d, %r9d -; CHECK-SSE1-NEXT: movl %r14d, %edi -; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %di -; 
CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di -; CHECK-SSE1-NEXT: xorl %r14d, %edi +; CHECK-SSE1-NEXT: xorl %ebx, %r9d ; CHECK-SSE1-NEXT: movl %r11d, %ebx ; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %bx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx ; CHECK-SSE1-NEXT: xorl %r11d, %ebx -; CHECK-SSE1-NEXT: movl %r10d, %ebp -; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %bp -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bp -; CHECK-SSE1-NEXT: xorl %r10d, %ebp -; CHECK-SSE1-NEXT: movw %bp, 14(%rax) -; CHECK-SSE1-NEXT: movw %bx, 12(%rax) -; CHECK-SSE1-NEXT: movw %di, 10(%rax) +; CHECK-SSE1-NEXT: movl %r10d, %r11d +; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r11w +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-SSE1-NEXT: xorl %r10d, %r11d +; CHECK-SSE1-NEXT: movl %edi, %r10d +; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: xorl %edi, %r10d +; CHECK-SSE1-NEXT: movw %r10w, 14(%rax) +; CHECK-SSE1-NEXT: movw %r11w, 12(%rax) +; CHECK-SSE1-NEXT: movw %bx, 10(%rax) ; CHECK-SSE1-NEXT: movw %r9w, 8(%rax) ; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) ; CHECK-SSE1-NEXT: movw %cx, 4(%rax) @@ -1013,30 +1013,30 @@ ; CHECK-BASELINE-LABEL: out_v4i32: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl 12(%rdx), %r8d -; CHECK-BASELINE-NEXT: movl 8(%rdx), %r9d -; CHECK-BASELINE-NEXT: movl (%rdx), %edi +; CHECK-BASELINE-NEXT: movl 12(%rdx), %edi +; CHECK-BASELINE-NEXT: movl 8(%rdx), %r8d +; CHECK-BASELINE-NEXT: movl (%rdx), %r9d ; CHECK-BASELINE-NEXT: movl 4(%rdx), %r10d -; CHECK-BASELINE-NEXT: movl (%rsi), %r11d -; CHECK-BASELINE-NEXT: xorl %edi, %r11d -; CHECK-BASELINE-NEXT: andl (%rcx), %r11d -; CHECK-BASELINE-NEXT: xorl %edi, %r11d -; CHECK-BASELINE-NEXT: movl 4(%rsi), %edi -; CHECK-BASELINE-NEXT: xorl %r10d, %edi -; CHECK-BASELINE-NEXT: andl 4(%rcx), %edi -; CHECK-BASELINE-NEXT: xorl %r10d, %edi -; CHECK-BASELINE-NEXT: movl 8(%rsi), %edx +; CHECK-BASELINE-NEXT: movl (%rsi), %edx ; CHECK-BASELINE-NEXT: xorl %r9d, %edx -; CHECK-BASELINE-NEXT: andl 8(%rcx), %edx +; CHECK-BASELINE-NEXT: andl (%rcx), %edx ; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d +; CHECK-BASELINE-NEXT: xorl %r10d, %r9d +; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d +; CHECK-BASELINE-NEXT: xorl %r10d, %r9d +; CHECK-BASELINE-NEXT: movl 8(%rsi), %r10d +; CHECK-BASELINE-NEXT: xorl %r8d, %r10d +; CHECK-BASELINE-NEXT: andl 8(%rcx), %r10d +; CHECK-BASELINE-NEXT: xorl %r8d, %r10d ; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi -; CHECK-BASELINE-NEXT: xorl %r8d, %esi +; CHECK-BASELINE-NEXT: xorl %edi, %esi ; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi -; CHECK-BASELINE-NEXT: xorl %r8d, %esi +; CHECK-BASELINE-NEXT: xorl %edi, %esi ; CHECK-BASELINE-NEXT: movl %esi, 12(%rax) -; CHECK-BASELINE-NEXT: movl %edx, 8(%rax) -; CHECK-BASELINE-NEXT: movl %edi, 4(%rax) -; CHECK-BASELINE-NEXT: movl %r11d, (%rax) +; CHECK-BASELINE-NEXT: movl %r10d, 8(%rax) +; CHECK-BASELINE-NEXT: movl %r9d, 4(%rax) +; CHECK-BASELINE-NEXT: movl %edx, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i32: @@ -1079,27 +1079,27 @@ ; CHECK-BASELINE-LABEL: out_v4i32_undef: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl 8(%rsi), %r9d +; CHECK-BASELINE-NEXT: movl 8(%rsi), %edi ; CHECK-BASELINE-NEXT: movl 12(%rdx), %r8d -; CHECK-BASELINE-NEXT: movl (%rdx), %edi -; CHECK-BASELINE-NEXT: movl 4(%rdx), %r10d -; CHECK-BASELINE-NEXT: andl 8(%rcx), %r9d -; CHECK-BASELINE-NEXT: movl (%rsi), %edx -; 
CHECK-BASELINE-NEXT: xorl %edi, %edx -; CHECK-BASELINE-NEXT: andl (%rcx), %edx -; CHECK-BASELINE-NEXT: xorl %edi, %edx -; CHECK-BASELINE-NEXT: movl 4(%rsi), %edi -; CHECK-BASELINE-NEXT: xorl %r10d, %edi -; CHECK-BASELINE-NEXT: andl 4(%rcx), %edi -; CHECK-BASELINE-NEXT: xorl %r10d, %edi -; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi -; CHECK-BASELINE-NEXT: xorl %r8d, %esi -; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi -; CHECK-BASELINE-NEXT: xorl %r8d, %esi -; CHECK-BASELINE-NEXT: movl %r9d, 8(%rax) -; CHECK-BASELINE-NEXT: movl %esi, 12(%rax) -; CHECK-BASELINE-NEXT: movl %edi, 4(%rax) -; CHECK-BASELINE-NEXT: movl %edx, (%rax) +; CHECK-BASELINE-NEXT: movl (%rdx), %r9d +; CHECK-BASELINE-NEXT: movl 4(%rdx), %edx +; CHECK-BASELINE-NEXT: andl 8(%rcx), %edi +; CHECK-BASELINE-NEXT: movl (%rsi), %r10d +; CHECK-BASELINE-NEXT: xorl %r9d, %r10d +; CHECK-BASELINE-NEXT: andl (%rcx), %r10d +; CHECK-BASELINE-NEXT: xorl %r9d, %r10d +; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d +; CHECK-BASELINE-NEXT: xorl %edx, %r9d +; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d +; CHECK-BASELINE-NEXT: xorl %edx, %r9d +; CHECK-BASELINE-NEXT: movl 12(%rsi), %edx +; CHECK-BASELINE-NEXT: xorl %r8d, %edx +; CHECK-BASELINE-NEXT: andl 12(%rcx), %edx +; CHECK-BASELINE-NEXT: xorl %r8d, %edx +; CHECK-BASELINE-NEXT: movl %edi, 8(%rax) +; CHECK-BASELINE-NEXT: movl %edx, 12(%rax) +; CHECK-BASELINE-NEXT: movl %r9d, 4(%rax) +; CHECK-BASELINE-NEXT: movl %r10d, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i32_undef: @@ -1210,21 +1210,21 @@ ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r14d -; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r15d -; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r12d -; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %esi +; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %ebx +; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %ebp +; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r14d +; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r15d +; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %r12d ; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %r13d ; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %edx ; CHECK-BASELINE-NEXT: movzbl 2(%r8), %edi ; CHECK-BASELINE-NEXT: movzbl (%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 1(%r8), %ecx -; CHECK-BASELINE-NEXT: movzbl (%r9), %ebx -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb (%r10), %bl -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl (%rsi), %esi +; CHECK-BASELINE-NEXT: xorb %al, %sil +; CHECK-BASELINE-NEXT: andb (%r10), %sil +; CHECK-BASELINE-NEXT: xorb %al, %sil +; CHECK-BASELINE-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 1(%r9), %eax ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: andb 1(%r10), %al @@ -1246,29 +1246,29 @@ ; CHECK-BASELINE-NEXT: xorb %r13b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 5(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %sil, %al +; CHECK-BASELINE-NEXT: xorb %r12b, %al ; CHECK-BASELINE-NEXT: andb 5(%r10), %al -; CHECK-BASELINE-NEXT: xorb %sil, %al +; CHECK-BASELINE-NEXT: xorb %r12b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 6(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %r12b, %al +; CHECK-BASELINE-NEXT: xorb %r15b, %al ; 
CHECK-BASELINE-NEXT: andb 6(%r10), %al -; CHECK-BASELINE-NEXT: xorb %r12b, %al +; CHECK-BASELINE-NEXT: xorb %r15b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 7(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: xorb %r14b, %al ; CHECK-BASELINE-NEXT: andb 7(%r10), %al -; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: xorb %r14b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 8(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: andb 8(%r10), %al -; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 9(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %bl, %al ; CHECK-BASELINE-NEXT: andb 9(%r10), %al -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %bl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%r9), %eax ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -1357,10 +1357,10 @@ ; CHECK-BASELINE-NEXT: andb 24(%r10), %r14b ; CHECK-BASELINE-NEXT: xorb %al, %r14b ; CHECK-BASELINE-NEXT: movzbl 25(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 25(%r9), %ebp -; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: andb 25(%r10), %bpl -; CHECK-BASELINE-NEXT: xorb %al, %bpl +; CHECK-BASELINE-NEXT: movzbl 25(%r9), %ebx +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: andb 25(%r10), %bl +; CHECK-BASELINE-NEXT: xorb %al, %bl ; CHECK-BASELINE-NEXT: movzbl 26(%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 26(%r9), %edi ; CHECK-BASELINE-NEXT: xorb %al, %dil @@ -1381,23 +1381,23 @@ ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb 29(%r10), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movzbl 30(%r8), %ebx +; CHECK-BASELINE-NEXT: movzbl 30(%r8), %ebp ; CHECK-BASELINE-NEXT: movzbl 30(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %bl, %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: andb 30(%r10), %al -; CHECK-BASELINE-NEXT: xorb %bl, %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movzbl 31(%r8), %r8d -; CHECK-BASELINE-NEXT: movzbl 31(%r9), %ebx -; CHECK-BASELINE-NEXT: xorb %r8b, %bl -; CHECK-BASELINE-NEXT: andb 31(%r10), %bl -; CHECK-BASELINE-NEXT: xorb %r8b, %bl -; CHECK-BASELINE-NEXT: movb %bl, 31(%r11) +; CHECK-BASELINE-NEXT: movzbl 31(%r9), %r9d +; CHECK-BASELINE-NEXT: xorb %r8b, %r9b +; CHECK-BASELINE-NEXT: andb 31(%r10), %r9b +; CHECK-BASELINE-NEXT: xorb %r8b, %r9b +; CHECK-BASELINE-NEXT: movb %r9b, 31(%r11) ; CHECK-BASELINE-NEXT: movb %al, 30(%r11) ; CHECK-BASELINE-NEXT: movb %cl, 29(%r11) ; CHECK-BASELINE-NEXT: movb %dl, 28(%r11) ; CHECK-BASELINE-NEXT: movb %sil, 27(%r11) ; CHECK-BASELINE-NEXT: movb %dil, 26(%r11) -; CHECK-BASELINE-NEXT: movb %bpl, 25(%r11) +; CHECK-BASELINE-NEXT: movb %bl, 25(%r11) ; CHECK-BASELINE-NEXT: movb %r14b, 24(%r11) ; CHECK-BASELINE-NEXT: movb %r15b, 23(%r11) ; CHECK-BASELINE-NEXT: movb %r12b, 22(%r11) @@ -1477,21 +1477,21 @@ ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%rdx), %ebp -; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r14d -; CHECK-SSE1-NEXT: movzbl 
7(%rdx), %r15d -; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r12d -; CHECK-SSE1-NEXT: movzbl 5(%rdx), %esi +; CHECK-SSE1-NEXT: movzbl 9(%rdx), %ebx +; CHECK-SSE1-NEXT: movzbl 8(%rdx), %ebp +; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r14d +; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r15d +; CHECK-SSE1-NEXT: movzbl 5(%rdx), %r12d ; CHECK-SSE1-NEXT: movzbl 4(%rdx), %r13d ; CHECK-SSE1-NEXT: movzbl 3(%rdx), %edx ; CHECK-SSE1-NEXT: movzbl 2(%r8), %edi ; CHECK-SSE1-NEXT: movzbl (%r8), %eax ; CHECK-SSE1-NEXT: movzbl 1(%r8), %ecx -; CHECK-SSE1-NEXT: movzbl (%r9), %ebx -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb (%r10), %bl -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl (%rsi), %esi +; CHECK-SSE1-NEXT: xorb %al, %sil +; CHECK-SSE1-NEXT: andb (%r10), %sil +; CHECK-SSE1-NEXT: xorb %al, %sil +; CHECK-SSE1-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 1(%r9), %eax ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: andb 1(%r10), %al @@ -1513,29 +1513,29 @@ ; CHECK-SSE1-NEXT: xorb %r13b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 5(%r9), %eax -; CHECK-SSE1-NEXT: xorb %sil, %al +; CHECK-SSE1-NEXT: xorb %r12b, %al ; CHECK-SSE1-NEXT: andb 5(%r10), %al -; CHECK-SSE1-NEXT: xorb %sil, %al +; CHECK-SSE1-NEXT: xorb %r12b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 6(%r9), %eax -; CHECK-SSE1-NEXT: xorb %r12b, %al +; CHECK-SSE1-NEXT: xorb %r15b, %al ; CHECK-SSE1-NEXT: andb 6(%r10), %al -; CHECK-SSE1-NEXT: xorb %r12b, %al +; CHECK-SSE1-NEXT: xorb %r15b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 7(%r9), %eax -; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: xorb %r14b, %al ; CHECK-SSE1-NEXT: andb 7(%r10), %al -; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: xorb %r14b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 8(%r9), %eax -; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: andb 8(%r10), %al -; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 9(%r9), %eax -; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: xorb %bl, %al ; CHECK-SSE1-NEXT: andb 9(%r10), %al -; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: xorb %bl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%r9), %eax ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -1624,10 +1624,10 @@ ; CHECK-SSE1-NEXT: andb 24(%r10), %r14b ; CHECK-SSE1-NEXT: xorb %al, %r14b ; CHECK-SSE1-NEXT: movzbl 25(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 25(%r9), %ebp -; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: andb 25(%r10), %bpl -; CHECK-SSE1-NEXT: xorb %al, %bpl +; CHECK-SSE1-NEXT: movzbl 25(%r9), %ebx +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: andb 25(%r10), %bl +; CHECK-SSE1-NEXT: xorb %al, %bl ; CHECK-SSE1-NEXT: movzbl 26(%r8), %eax ; CHECK-SSE1-NEXT: movzbl 26(%r9), %edi ; CHECK-SSE1-NEXT: xorb %al, %dil @@ -1648,23 +1648,23 @@ ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb 29(%r10), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movzbl 30(%r8), %ebx +; CHECK-SSE1-NEXT: movzbl 30(%r8), %ebp ; CHECK-SSE1-NEXT: movzbl 30(%r9), %eax -; CHECK-SSE1-NEXT: xorb 
%bl, %al +; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: andb 30(%r10), %al -; CHECK-SSE1-NEXT: xorb %bl, %al +; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: movzbl 31(%r8), %r8d -; CHECK-SSE1-NEXT: movzbl 31(%r9), %ebx -; CHECK-SSE1-NEXT: xorb %r8b, %bl -; CHECK-SSE1-NEXT: andb 31(%r10), %bl -; CHECK-SSE1-NEXT: xorb %r8b, %bl -; CHECK-SSE1-NEXT: movb %bl, 31(%r11) +; CHECK-SSE1-NEXT: movzbl 31(%r9), %r9d +; CHECK-SSE1-NEXT: xorb %r8b, %r9b +; CHECK-SSE1-NEXT: andb 31(%r10), %r9b +; CHECK-SSE1-NEXT: xorb %r8b, %r9b +; CHECK-SSE1-NEXT: movb %r9b, 31(%r11) ; CHECK-SSE1-NEXT: movb %al, 30(%r11) ; CHECK-SSE1-NEXT: movb %cl, 29(%r11) ; CHECK-SSE1-NEXT: movb %dl, 28(%r11) ; CHECK-SSE1-NEXT: movb %sil, 27(%r11) ; CHECK-SSE1-NEXT: movb %dil, 26(%r11) -; CHECK-SSE1-NEXT: movb %bpl, 25(%r11) +; CHECK-SSE1-NEXT: movb %bl, 25(%r11) ; CHECK-SSE1-NEXT: movb %r14b, 24(%r11) ; CHECK-SSE1-NEXT: movb %r15b, 23(%r11) ; CHECK-SSE1-NEXT: movb %r12b, 22(%r11) @@ -1761,51 +1761,51 @@ ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r15d ; CHECK-BASELINE-NEXT: movzwl 16(%rdx), %r14d -; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %r11d -; CHECK-BASELINE-NEXT: movzwl 12(%rdx), %r13d -; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r8d -; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %ebx -; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r12d -; CHECK-BASELINE-NEXT: movzwl (%rdx), %ebp -; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r10d +; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %ebp +; CHECK-BASELINE-NEXT: movzwl 12(%rdx), %ebx +; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r13d +; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r11d +; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r10d +; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r9d +; CHECK-BASELINE-NEXT: movzwl (%rdx), %r8d +; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r12d ; CHECK-BASELINE-NEXT: movzwl (%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %bp, %ax +; CHECK-BASELINE-NEXT: xorw %r8w, %ax ; CHECK-BASELINE-NEXT: andw (%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %ebp -; CHECK-BASELINE-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: xorl %eax, %r8d +; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r10w, %ax +; CHECK-BASELINE-NEXT: xorw %r12w, %ax ; CHECK-BASELINE-NEXT: andw 2(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r10d +; CHECK-BASELINE-NEXT: xorl %eax, %r12d ; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r12w, %ax +; CHECK-BASELINE-NEXT: xorw %r9w, %ax ; CHECK-BASELINE-NEXT: andw 4(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r12d +; CHECK-BASELINE-NEXT: xorl %eax, %r9d +; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %bx, %ax +; CHECK-BASELINE-NEXT: xorw %r10w, %ax ; CHECK-BASELINE-NEXT: andw 6(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %ebx -; CHECK-BASELINE-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: xorl %eax, %r10d +; CHECK-BASELINE-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r8w, %ax +; CHECK-BASELINE-NEXT: xorw %r11w, %ax ; CHECK-BASELINE-NEXT: andw 8(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r8d -; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: xorl %eax, %r11d +; CHECK-BASELINE-NEXT: movl %r11d, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r9w, %ax +; CHECK-BASELINE-NEXT: xorw %r13w, %ax ; CHECK-BASELINE-NEXT: andw 10(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r9d -; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: xorl %eax, %r13d +; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r13w, %ax +; CHECK-BASELINE-NEXT: xorw %bx, %ax ; CHECK-BASELINE-NEXT: andw 12(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r13d +; CHECK-BASELINE-NEXT: xorl %eax, %ebx ; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r11w, %ax +; CHECK-BASELINE-NEXT: xorw %bp, %ax ; CHECK-BASELINE-NEXT: andw 14(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r11d -; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: xorl %eax, %ebp ; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %r14w, %ax ; CHECK-BASELINE-NEXT: andw 16(%rcx), %ax @@ -1814,11 +1814,11 @@ ; CHECK-BASELINE-NEXT: xorw %r15w, %ax ; CHECK-BASELINE-NEXT: andw 18(%rcx), %ax ; CHECK-BASELINE-NEXT: xorl %eax, %r15d -; CHECK-BASELINE-NEXT: movzwl 20(%rdx), %ebx +; CHECK-BASELINE-NEXT: movzwl 20(%rdx), %r13d ; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %bx, %ax +; CHECK-BASELINE-NEXT: xorw %r13w, %ax ; CHECK-BASELINE-NEXT: andw 20(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %ebx +; CHECK-BASELINE-NEXT: xorl %eax, %r13d ; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %r9d ; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %r9w, %ax @@ -1830,39 +1830,39 @@ ; CHECK-BASELINE-NEXT: andw 24(%rcx), %ax ; CHECK-BASELINE-NEXT: xorl %eax, %r8d ; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %eax -; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r11d -; CHECK-BASELINE-NEXT: xorw %ax, %r11w -; CHECK-BASELINE-NEXT: andw 26(%rcx), %r11w -; CHECK-BASELINE-NEXT: xorl %r11d, %eax -; CHECK-BASELINE-NEXT: movzwl 28(%rdx), %r11d -; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %ebp -; CHECK-BASELINE-NEXT: xorw %r11w, %bp -; CHECK-BASELINE-NEXT: andw 28(%rcx), %bp -; CHECK-BASELINE-NEXT: xorl %ebp, %r11d +; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r10d +; CHECK-BASELINE-NEXT: xorw %ax, %r10w +; CHECK-BASELINE-NEXT: andw 26(%rcx), %r10w +; CHECK-BASELINE-NEXT: xorl %r10d, %eax +; CHECK-BASELINE-NEXT: movzwl 28(%rdx), %r10d +; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %r11d +; CHECK-BASELINE-NEXT: xorw %r10w, %r11w +; CHECK-BASELINE-NEXT: andw 28(%rcx), %r11w +; CHECK-BASELINE-NEXT: xorl %r11d, %r10d ; CHECK-BASELINE-NEXT: movzwl 30(%rdx), %edx ; CHECK-BASELINE-NEXT: movzwl 30(%rsi), %esi ; CHECK-BASELINE-NEXT: xorw %dx, %si ; CHECK-BASELINE-NEXT: andw 30(%rcx), %si ; CHECK-BASELINE-NEXT: xorl %esi, %edx ; CHECK-BASELINE-NEXT: movw %dx, 30(%rdi) -; CHECK-BASELINE-NEXT: movw %r11w, 28(%rdi) +; CHECK-BASELINE-NEXT: movw %r10w, 28(%rdi) ; CHECK-BASELINE-NEXT: movw %ax, 26(%rdi) ; CHECK-BASELINE-NEXT: movw %r8w, 24(%rdi) ; CHECK-BASELINE-NEXT: movw %r9w, 22(%rdi) -; CHECK-BASELINE-NEXT: movw %bx, 20(%rdi) +; CHECK-BASELINE-NEXT: movw %r13w, 20(%rdi) ; CHECK-BASELINE-NEXT: movw %r15w, 18(%rdi) ; CHECK-BASELINE-NEXT: movw %r14w, 16(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 14(%rdi) -; CHECK-BASELINE-NEXT: movw %r13w, 12(%rdi) +; CHECK-BASELINE-NEXT: movw %bp, 14(%rdi) +; CHECK-BASELINE-NEXT: 
movw %bx, 12(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-BASELINE-NEXT: movw %ax, 10(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-BASELINE-NEXT: movw %ax, 8(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-BASELINE-NEXT: movw %ax, 6(%rdi) -; CHECK-BASELINE-NEXT: movw %r12w, 4(%rdi) -; CHECK-BASELINE-NEXT: movw %r10w, 2(%rdi) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %ax, 4(%rdi) +; CHECK-BASELINE-NEXT: movw %r12w, 2(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-BASELINE-NEXT: movw %ax, (%rdi) ; CHECK-BASELINE-NEXT: movq %rdi, %rax @@ -1884,51 +1884,51 @@ ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r15d ; CHECK-SSE1-NEXT: movzwl 16(%rdx), %r14d -; CHECK-SSE1-NEXT: movzwl 14(%rdx), %r11d -; CHECK-SSE1-NEXT: movzwl 12(%rdx), %r13d -; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r9d -; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r8d -; CHECK-SSE1-NEXT: movzwl 6(%rdx), %ebx -; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r12d -; CHECK-SSE1-NEXT: movzwl (%rdx), %ebp -; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r10d +; CHECK-SSE1-NEXT: movzwl 14(%rdx), %ebp +; CHECK-SSE1-NEXT: movzwl 12(%rdx), %ebx +; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r13d +; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r11d +; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r10d +; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r9d +; CHECK-SSE1-NEXT: movzwl (%rdx), %r8d +; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r12d ; CHECK-SSE1-NEXT: movzwl (%rsi), %eax -; CHECK-SSE1-NEXT: xorw %bp, %ax +; CHECK-SSE1-NEXT: xorw %r8w, %ax ; CHECK-SSE1-NEXT: andw (%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %ebp -; CHECK-SSE1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: xorl %eax, %r8d +; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 2(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r10w, %ax +; CHECK-SSE1-NEXT: xorw %r12w, %ax ; CHECK-SSE1-NEXT: andw 2(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r10d +; CHECK-SSE1-NEXT: xorl %eax, %r12d ; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r12w, %ax +; CHECK-SSE1-NEXT: xorw %r9w, %ax ; CHECK-SSE1-NEXT: andw 4(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r12d +; CHECK-SSE1-NEXT: xorl %eax, %r9d +; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 6(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %bx, %ax +; CHECK-SSE1-NEXT: xorw %r10w, %ax ; CHECK-SSE1-NEXT: andw 6(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %ebx -; CHECK-SSE1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: xorl %eax, %r10d +; CHECK-SSE1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r8w, %ax +; CHECK-SSE1-NEXT: xorw %r11w, %ax ; CHECK-SSE1-NEXT: andw 8(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r8d -; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: xorl %eax, %r11d +; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r9w, %ax +; CHECK-SSE1-NEXT: xorw %r13w, %ax ; CHECK-SSE1-NEXT: andw 10(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r9d -; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: xorl %eax, %r13d +; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; 
CHECK-SSE1-NEXT: movzwl 12(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r13w, %ax +; CHECK-SSE1-NEXT: xorw %bx, %ax ; CHECK-SSE1-NEXT: andw 12(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r13d +; CHECK-SSE1-NEXT: xorl %eax, %ebx ; CHECK-SSE1-NEXT: movzwl 14(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r11w, %ax +; CHECK-SSE1-NEXT: xorw %bp, %ax ; CHECK-SSE1-NEXT: andw 14(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r11d -; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: xorl %eax, %ebp ; CHECK-SSE1-NEXT: movzwl 16(%rsi), %eax ; CHECK-SSE1-NEXT: xorw %r14w, %ax ; CHECK-SSE1-NEXT: andw 16(%rcx), %ax @@ -1937,11 +1937,11 @@ ; CHECK-SSE1-NEXT: xorw %r15w, %ax ; CHECK-SSE1-NEXT: andw 18(%rcx), %ax ; CHECK-SSE1-NEXT: xorl %eax, %r15d -; CHECK-SSE1-NEXT: movzwl 20(%rdx), %ebx +; CHECK-SSE1-NEXT: movzwl 20(%rdx), %r13d ; CHECK-SSE1-NEXT: movzwl 20(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %bx, %ax +; CHECK-SSE1-NEXT: xorw %r13w, %ax ; CHECK-SSE1-NEXT: andw 20(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %ebx +; CHECK-SSE1-NEXT: xorl %eax, %r13d ; CHECK-SSE1-NEXT: movzwl 22(%rdx), %r9d ; CHECK-SSE1-NEXT: movzwl 22(%rsi), %eax ; CHECK-SSE1-NEXT: xorw %r9w, %ax @@ -1953,39 +1953,39 @@ ; CHECK-SSE1-NEXT: andw 24(%rcx), %ax ; CHECK-SSE1-NEXT: xorl %eax, %r8d ; CHECK-SSE1-NEXT: movzwl 26(%rdx), %eax -; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r11d -; CHECK-SSE1-NEXT: xorw %ax, %r11w -; CHECK-SSE1-NEXT: andw 26(%rcx), %r11w -; CHECK-SSE1-NEXT: xorl %r11d, %eax -; CHECK-SSE1-NEXT: movzwl 28(%rdx), %r11d -; CHECK-SSE1-NEXT: movzwl 28(%rsi), %ebp -; CHECK-SSE1-NEXT: xorw %r11w, %bp -; CHECK-SSE1-NEXT: andw 28(%rcx), %bp -; CHECK-SSE1-NEXT: xorl %ebp, %r11d +; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r10d +; CHECK-SSE1-NEXT: xorw %ax, %r10w +; CHECK-SSE1-NEXT: andw 26(%rcx), %r10w +; CHECK-SSE1-NEXT: xorl %r10d, %eax +; CHECK-SSE1-NEXT: movzwl 28(%rdx), %r10d +; CHECK-SSE1-NEXT: movzwl 28(%rsi), %r11d +; CHECK-SSE1-NEXT: xorw %r10w, %r11w +; CHECK-SSE1-NEXT: andw 28(%rcx), %r11w +; CHECK-SSE1-NEXT: xorl %r11d, %r10d ; CHECK-SSE1-NEXT: movzwl 30(%rdx), %edx ; CHECK-SSE1-NEXT: movzwl 30(%rsi), %esi ; CHECK-SSE1-NEXT: xorw %dx, %si ; CHECK-SSE1-NEXT: andw 30(%rcx), %si ; CHECK-SSE1-NEXT: xorl %esi, %edx ; CHECK-SSE1-NEXT: movw %dx, 30(%rdi) -; CHECK-SSE1-NEXT: movw %r11w, 28(%rdi) +; CHECK-SSE1-NEXT: movw %r10w, 28(%rdi) ; CHECK-SSE1-NEXT: movw %ax, 26(%rdi) ; CHECK-SSE1-NEXT: movw %r8w, 24(%rdi) ; CHECK-SSE1-NEXT: movw %r9w, 22(%rdi) -; CHECK-SSE1-NEXT: movw %bx, 20(%rdi) +; CHECK-SSE1-NEXT: movw %r13w, 20(%rdi) ; CHECK-SSE1-NEXT: movw %r15w, 18(%rdi) ; CHECK-SSE1-NEXT: movw %r14w, 16(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 14(%rdi) -; CHECK-SSE1-NEXT: movw %r13w, 12(%rdi) +; CHECK-SSE1-NEXT: movw %bp, 14(%rdi) +; CHECK-SSE1-NEXT: movw %bx, 12(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-SSE1-NEXT: movw %ax, 10(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-SSE1-NEXT: movw %ax, 8(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-SSE1-NEXT: movw %ax, 6(%rdi) -; CHECK-SSE1-NEXT: movw %r12w, 4(%rdi) -; CHECK-SSE1-NEXT: movw %r10w, 2(%rdi) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-SSE1-NEXT: movw %ax, 4(%rdi) +; CHECK-SSE1-NEXT: movw %r12w, 2(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-SSE1-NEXT: movw %ax, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax @@ 
-2031,126 +2031,118 @@ ; CHECK-BASELINE-LABEL: out_v8i32: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: pushq %rbp -; CHECK-BASELINE-NEXT: pushq %r15 ; CHECK-BASELINE-NEXT: pushq %r14 -; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl 28(%rdx), %r8d -; CHECK-BASELINE-NEXT: movl 24(%rdx), %r9d +; CHECK-BASELINE-NEXT: movl 28(%rdx), %edi +; CHECK-BASELINE-NEXT: movl 24(%rdx), %r8d ; CHECK-BASELINE-NEXT: movl 20(%rdx), %r10d -; CHECK-BASELINE-NEXT: movl 16(%rdx), %r14d -; CHECK-BASELINE-NEXT: movl 12(%rdx), %ebx +; CHECK-BASELINE-NEXT: movl 16(%rdx), %ebx +; CHECK-BASELINE-NEXT: movl 12(%rdx), %r14d ; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebp -; CHECK-BASELINE-NEXT: movl (%rdx), %edi -; CHECK-BASELINE-NEXT: movl 4(%rdx), %edx -; CHECK-BASELINE-NEXT: movl (%rsi), %r11d -; CHECK-BASELINE-NEXT: xorl %edi, %r11d -; CHECK-BASELINE-NEXT: andl (%rcx), %r11d -; CHECK-BASELINE-NEXT: xorl %edi, %r11d -; CHECK-BASELINE-NEXT: movl 4(%rsi), %r15d -; CHECK-BASELINE-NEXT: xorl %edx, %r15d -; CHECK-BASELINE-NEXT: andl 4(%rcx), %r15d -; CHECK-BASELINE-NEXT: xorl %edx, %r15d -; CHECK-BASELINE-NEXT: movl 8(%rsi), %r12d -; CHECK-BASELINE-NEXT: xorl %ebp, %r12d -; CHECK-BASELINE-NEXT: andl 8(%rcx), %r12d -; CHECK-BASELINE-NEXT: xorl %ebp, %r12d -; CHECK-BASELINE-NEXT: movl 12(%rsi), %ebp -; CHECK-BASELINE-NEXT: xorl %ebx, %ebp -; CHECK-BASELINE-NEXT: andl 12(%rcx), %ebp -; CHECK-BASELINE-NEXT: xorl %ebx, %ebp -; CHECK-BASELINE-NEXT: movl 16(%rsi), %ebx -; CHECK-BASELINE-NEXT: xorl %r14d, %ebx -; CHECK-BASELINE-NEXT: andl 16(%rcx), %ebx -; CHECK-BASELINE-NEXT: xorl %r14d, %ebx -; CHECK-BASELINE-NEXT: movl 20(%rsi), %edi -; CHECK-BASELINE-NEXT: xorl %r10d, %edi -; CHECK-BASELINE-NEXT: andl 20(%rcx), %edi -; CHECK-BASELINE-NEXT: xorl %r10d, %edi -; CHECK-BASELINE-NEXT: movl 24(%rsi), %edx +; CHECK-BASELINE-NEXT: movl (%rdx), %r9d +; CHECK-BASELINE-NEXT: movl 4(%rdx), %r11d +; CHECK-BASELINE-NEXT: movl (%rsi), %edx ; CHECK-BASELINE-NEXT: xorl %r9d, %edx -; CHECK-BASELINE-NEXT: andl 24(%rcx), %edx +; CHECK-BASELINE-NEXT: andl (%rcx), %edx ; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d +; CHECK-BASELINE-NEXT: xorl %r11d, %r9d +; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d +; CHECK-BASELINE-NEXT: xorl %r11d, %r9d +; CHECK-BASELINE-NEXT: movl 8(%rsi), %r11d +; CHECK-BASELINE-NEXT: xorl %ebp, %r11d +; CHECK-BASELINE-NEXT: andl 8(%rcx), %r11d +; CHECK-BASELINE-NEXT: xorl %ebp, %r11d +; CHECK-BASELINE-NEXT: movl 12(%rsi), %ebp +; CHECK-BASELINE-NEXT: xorl %r14d, %ebp +; CHECK-BASELINE-NEXT: andl 12(%rcx), %ebp +; CHECK-BASELINE-NEXT: xorl %r14d, %ebp +; CHECK-BASELINE-NEXT: movl 16(%rsi), %r14d +; CHECK-BASELINE-NEXT: xorl %ebx, %r14d +; CHECK-BASELINE-NEXT: andl 16(%rcx), %r14d +; CHECK-BASELINE-NEXT: xorl %ebx, %r14d +; CHECK-BASELINE-NEXT: movl 20(%rsi), %ebx +; CHECK-BASELINE-NEXT: xorl %r10d, %ebx +; CHECK-BASELINE-NEXT: andl 20(%rcx), %ebx +; CHECK-BASELINE-NEXT: xorl %r10d, %ebx +; CHECK-BASELINE-NEXT: movl 24(%rsi), %r10d +; CHECK-BASELINE-NEXT: xorl %r8d, %r10d +; CHECK-BASELINE-NEXT: andl 24(%rcx), %r10d +; CHECK-BASELINE-NEXT: xorl %r8d, %r10d ; CHECK-BASELINE-NEXT: movl 28(%rsi), %esi -; CHECK-BASELINE-NEXT: xorl %r8d, %esi +; CHECK-BASELINE-NEXT: xorl %edi, %esi ; CHECK-BASELINE-NEXT: andl 28(%rcx), %esi -; CHECK-BASELINE-NEXT: xorl %r8d, %esi +; CHECK-BASELINE-NEXT: xorl %edi, %esi ; CHECK-BASELINE-NEXT: movl %esi, 28(%rax) -; CHECK-BASELINE-NEXT: movl %edx, 24(%rax) -; CHECK-BASELINE-NEXT: 
movl %edi, 20(%rax) -; CHECK-BASELINE-NEXT: movl %ebx, 16(%rax) +; CHECK-BASELINE-NEXT: movl %r10d, 24(%rax) +; CHECK-BASELINE-NEXT: movl %ebx, 20(%rax) +; CHECK-BASELINE-NEXT: movl %r14d, 16(%rax) ; CHECK-BASELINE-NEXT: movl %ebp, 12(%rax) -; CHECK-BASELINE-NEXT: movl %r12d, 8(%rax) -; CHECK-BASELINE-NEXT: movl %r15d, 4(%rax) -; CHECK-BASELINE-NEXT: movl %r11d, (%rax) +; CHECK-BASELINE-NEXT: movl %r11d, 8(%rax) +; CHECK-BASELINE-NEXT: movl %r9d, 4(%rax) +; CHECK-BASELINE-NEXT: movl %edx, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx -; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r14 -; CHECK-BASELINE-NEXT: popq %r15 ; CHECK-BASELINE-NEXT: popq %rbp ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v8i32: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: pushq %rbp -; CHECK-SSE1-NEXT: pushq %r15 ; CHECK-SSE1-NEXT: pushq %r14 -; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movl 28(%rdx), %r8d -; CHECK-SSE1-NEXT: movl 24(%rdx), %r9d +; CHECK-SSE1-NEXT: movl 28(%rdx), %edi +; CHECK-SSE1-NEXT: movl 24(%rdx), %r8d ; CHECK-SSE1-NEXT: movl 20(%rdx), %r10d -; CHECK-SSE1-NEXT: movl 16(%rdx), %r14d -; CHECK-SSE1-NEXT: movl 12(%rdx), %ebx +; CHECK-SSE1-NEXT: movl 16(%rdx), %ebx +; CHECK-SSE1-NEXT: movl 12(%rdx), %r14d ; CHECK-SSE1-NEXT: movl 8(%rdx), %ebp -; CHECK-SSE1-NEXT: movl (%rdx), %edi -; CHECK-SSE1-NEXT: movl 4(%rdx), %edx -; CHECK-SSE1-NEXT: movl (%rsi), %r11d -; CHECK-SSE1-NEXT: xorl %edi, %r11d -; CHECK-SSE1-NEXT: andl (%rcx), %r11d -; CHECK-SSE1-NEXT: xorl %edi, %r11d -; CHECK-SSE1-NEXT: movl 4(%rsi), %r15d -; CHECK-SSE1-NEXT: xorl %edx, %r15d -; CHECK-SSE1-NEXT: andl 4(%rcx), %r15d -; CHECK-SSE1-NEXT: xorl %edx, %r15d -; CHECK-SSE1-NEXT: movl 8(%rsi), %r12d -; CHECK-SSE1-NEXT: xorl %ebp, %r12d -; CHECK-SSE1-NEXT: andl 8(%rcx), %r12d -; CHECK-SSE1-NEXT: xorl %ebp, %r12d -; CHECK-SSE1-NEXT: movl 12(%rsi), %ebp -; CHECK-SSE1-NEXT: xorl %ebx, %ebp -; CHECK-SSE1-NEXT: andl 12(%rcx), %ebp -; CHECK-SSE1-NEXT: xorl %ebx, %ebp -; CHECK-SSE1-NEXT: movl 16(%rsi), %ebx -; CHECK-SSE1-NEXT: xorl %r14d, %ebx -; CHECK-SSE1-NEXT: andl 16(%rcx), %ebx -; CHECK-SSE1-NEXT: xorl %r14d, %ebx -; CHECK-SSE1-NEXT: movl 20(%rsi), %edi -; CHECK-SSE1-NEXT: xorl %r10d, %edi -; CHECK-SSE1-NEXT: andl 20(%rcx), %edi -; CHECK-SSE1-NEXT: xorl %r10d, %edi -; CHECK-SSE1-NEXT: movl 24(%rsi), %edx +; CHECK-SSE1-NEXT: movl (%rdx), %r9d +; CHECK-SSE1-NEXT: movl 4(%rdx), %r11d +; CHECK-SSE1-NEXT: movl (%rsi), %edx ; CHECK-SSE1-NEXT: xorl %r9d, %edx -; CHECK-SSE1-NEXT: andl 24(%rcx), %edx +; CHECK-SSE1-NEXT: andl (%rcx), %edx ; CHECK-SSE1-NEXT: xorl %r9d, %edx +; CHECK-SSE1-NEXT: movl 4(%rsi), %r9d +; CHECK-SSE1-NEXT: xorl %r11d, %r9d +; CHECK-SSE1-NEXT: andl 4(%rcx), %r9d +; CHECK-SSE1-NEXT: xorl %r11d, %r9d +; CHECK-SSE1-NEXT: movl 8(%rsi), %r11d +; CHECK-SSE1-NEXT: xorl %ebp, %r11d +; CHECK-SSE1-NEXT: andl 8(%rcx), %r11d +; CHECK-SSE1-NEXT: xorl %ebp, %r11d +; CHECK-SSE1-NEXT: movl 12(%rsi), %ebp +; CHECK-SSE1-NEXT: xorl %r14d, %ebp +; CHECK-SSE1-NEXT: andl 12(%rcx), %ebp +; CHECK-SSE1-NEXT: xorl %r14d, %ebp +; CHECK-SSE1-NEXT: movl 16(%rsi), %r14d +; CHECK-SSE1-NEXT: xorl %ebx, %r14d +; CHECK-SSE1-NEXT: andl 16(%rcx), %r14d +; CHECK-SSE1-NEXT: xorl %ebx, %r14d +; CHECK-SSE1-NEXT: movl 20(%rsi), %ebx +; CHECK-SSE1-NEXT: xorl %r10d, %ebx +; CHECK-SSE1-NEXT: andl 20(%rcx), %ebx +; CHECK-SSE1-NEXT: xorl %r10d, %ebx +; CHECK-SSE1-NEXT: movl 24(%rsi), %r10d +; CHECK-SSE1-NEXT: xorl %r8d, %r10d +; CHECK-SSE1-NEXT: andl 24(%rcx), %r10d +; CHECK-SSE1-NEXT: xorl 
%r8d, %r10d ; CHECK-SSE1-NEXT: movl 28(%rsi), %esi -; CHECK-SSE1-NEXT: xorl %r8d, %esi +; CHECK-SSE1-NEXT: xorl %edi, %esi ; CHECK-SSE1-NEXT: andl 28(%rcx), %esi -; CHECK-SSE1-NEXT: xorl %r8d, %esi +; CHECK-SSE1-NEXT: xorl %edi, %esi ; CHECK-SSE1-NEXT: movl %esi, 28(%rax) -; CHECK-SSE1-NEXT: movl %edx, 24(%rax) -; CHECK-SSE1-NEXT: movl %edi, 20(%rax) -; CHECK-SSE1-NEXT: movl %ebx, 16(%rax) +; CHECK-SSE1-NEXT: movl %r10d, 24(%rax) +; CHECK-SSE1-NEXT: movl %ebx, 20(%rax) +; CHECK-SSE1-NEXT: movl %r14d, 16(%rax) ; CHECK-SSE1-NEXT: movl %ebp, 12(%rax) -; CHECK-SSE1-NEXT: movl %r12d, 8(%rax) -; CHECK-SSE1-NEXT: movl %r15d, 4(%rax) -; CHECK-SSE1-NEXT: movl %r11d, (%rax) +; CHECK-SSE1-NEXT: movl %r11d, 8(%rax) +; CHECK-SSE1-NEXT: movl %r9d, 4(%rax) +; CHECK-SSE1-NEXT: movl %edx, (%rax) ; CHECK-SSE1-NEXT: popq %rbx -; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r14 -; CHECK-SSE1-NEXT: popq %r15 ; CHECK-SSE1-NEXT: popq %rbp ; CHECK-SSE1-NEXT: retq ; @@ -2188,59 +2180,59 @@ ; CHECK-BASELINE-LABEL: out_v4i64: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movq 24(%rdx), %r8 -; CHECK-BASELINE-NEXT: movq 16(%rdx), %r9 -; CHECK-BASELINE-NEXT: movq (%rdx), %rdi +; CHECK-BASELINE-NEXT: movq 24(%rdx), %rdi +; CHECK-BASELINE-NEXT: movq 16(%rdx), %r8 +; CHECK-BASELINE-NEXT: movq (%rdx), %r9 ; CHECK-BASELINE-NEXT: movq 8(%rdx), %r10 -; CHECK-BASELINE-NEXT: movq (%rsi), %r11 -; CHECK-BASELINE-NEXT: xorq %rdi, %r11 -; CHECK-BASELINE-NEXT: andq (%rcx), %r11 -; CHECK-BASELINE-NEXT: xorq %rdi, %r11 -; CHECK-BASELINE-NEXT: movq 8(%rsi), %rdi -; CHECK-BASELINE-NEXT: xorq %r10, %rdi -; CHECK-BASELINE-NEXT: andq 8(%rcx), %rdi -; CHECK-BASELINE-NEXT: xorq %r10, %rdi -; CHECK-BASELINE-NEXT: movq 16(%rsi), %rdx +; CHECK-BASELINE-NEXT: movq (%rsi), %rdx ; CHECK-BASELINE-NEXT: xorq %r9, %rdx -; CHECK-BASELINE-NEXT: andq 16(%rcx), %rdx +; CHECK-BASELINE-NEXT: andq (%rcx), %rdx ; CHECK-BASELINE-NEXT: xorq %r9, %rdx +; CHECK-BASELINE-NEXT: movq 8(%rsi), %r9 +; CHECK-BASELINE-NEXT: xorq %r10, %r9 +; CHECK-BASELINE-NEXT: andq 8(%rcx), %r9 +; CHECK-BASELINE-NEXT: xorq %r10, %r9 +; CHECK-BASELINE-NEXT: movq 16(%rsi), %r10 +; CHECK-BASELINE-NEXT: xorq %r8, %r10 +; CHECK-BASELINE-NEXT: andq 16(%rcx), %r10 +; CHECK-BASELINE-NEXT: xorq %r8, %r10 ; CHECK-BASELINE-NEXT: movq 24(%rsi), %rsi -; CHECK-BASELINE-NEXT: xorq %r8, %rsi +; CHECK-BASELINE-NEXT: xorq %rdi, %rsi ; CHECK-BASELINE-NEXT: andq 24(%rcx), %rsi -; CHECK-BASELINE-NEXT: xorq %r8, %rsi +; CHECK-BASELINE-NEXT: xorq %rdi, %rsi ; CHECK-BASELINE-NEXT: movq %rsi, 24(%rax) -; CHECK-BASELINE-NEXT: movq %rdx, 16(%rax) -; CHECK-BASELINE-NEXT: movq %rdi, 8(%rax) -; CHECK-BASELINE-NEXT: movq %r11, (%rax) +; CHECK-BASELINE-NEXT: movq %r10, 16(%rax) +; CHECK-BASELINE-NEXT: movq %r9, 8(%rax) +; CHECK-BASELINE-NEXT: movq %rdx, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i64: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movq 24(%rdx), %r8 -; CHECK-SSE1-NEXT: movq 16(%rdx), %r9 -; CHECK-SSE1-NEXT: movq (%rdx), %rdi +; CHECK-SSE1-NEXT: movq 24(%rdx), %rdi +; CHECK-SSE1-NEXT: movq 16(%rdx), %r8 +; CHECK-SSE1-NEXT: movq (%rdx), %r9 ; CHECK-SSE1-NEXT: movq 8(%rdx), %r10 -; CHECK-SSE1-NEXT: movq (%rsi), %r11 -; CHECK-SSE1-NEXT: xorq %rdi, %r11 -; CHECK-SSE1-NEXT: andq (%rcx), %r11 -; CHECK-SSE1-NEXT: xorq %rdi, %r11 -; CHECK-SSE1-NEXT: movq 8(%rsi), %rdi -; CHECK-SSE1-NEXT: xorq %r10, %rdi -; CHECK-SSE1-NEXT: andq 8(%rcx), %rdi -; CHECK-SSE1-NEXT: xorq %r10, %rdi -; CHECK-SSE1-NEXT: movq 
16(%rsi), %rdx +; CHECK-SSE1-NEXT: movq (%rsi), %rdx ; CHECK-SSE1-NEXT: xorq %r9, %rdx -; CHECK-SSE1-NEXT: andq 16(%rcx), %rdx +; CHECK-SSE1-NEXT: andq (%rcx), %rdx ; CHECK-SSE1-NEXT: xorq %r9, %rdx +; CHECK-SSE1-NEXT: movq 8(%rsi), %r9 +; CHECK-SSE1-NEXT: xorq %r10, %r9 +; CHECK-SSE1-NEXT: andq 8(%rcx), %r9 +; CHECK-SSE1-NEXT: xorq %r10, %r9 +; CHECK-SSE1-NEXT: movq 16(%rsi), %r10 +; CHECK-SSE1-NEXT: xorq %r8, %r10 +; CHECK-SSE1-NEXT: andq 16(%rcx), %r10 +; CHECK-SSE1-NEXT: xorq %r8, %r10 ; CHECK-SSE1-NEXT: movq 24(%rsi), %rsi -; CHECK-SSE1-NEXT: xorq %r8, %rsi +; CHECK-SSE1-NEXT: xorq %rdi, %rsi ; CHECK-SSE1-NEXT: andq 24(%rcx), %rsi -; CHECK-SSE1-NEXT: xorq %r8, %rsi +; CHECK-SSE1-NEXT: xorq %rdi, %rsi ; CHECK-SSE1-NEXT: movq %rsi, 24(%rax) -; CHECK-SSE1-NEXT: movq %rdx, 16(%rax) -; CHECK-SSE1-NEXT: movq %rdi, 8(%rax) -; CHECK-SSE1-NEXT: movq %r11, (%rax) +; CHECK-SSE1-NEXT: movq %r10, 16(%rax) +; CHECK-SSE1-NEXT: movq %r9, 8(%rax) +; CHECK-SSE1-NEXT: movq %rdx, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i64: @@ -2501,20 +2493,20 @@ ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-BASELINE-NEXT: xorb %r11b, %sil ; CHECK-BASELINE-NEXT: xorb %r12b, %dl -; CHECK-BASELINE-NEXT: xorb %r15b, %cl -; CHECK-BASELINE-NEXT: xorb %r14b, %r8b -; CHECK-BASELINE-NEXT: xorb %bpl, %r9b +; CHECK-BASELINE-NEXT: xorb %r14b, %cl +; CHECK-BASELINE-NEXT: xorb %bpl, %r8b +; CHECK-BASELINE-NEXT: xorb %bl, %r9b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r15b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r13b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %bl ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-BASELINE-NEXT: xorb %r10b, %al ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b @@ -2523,19 +2515,19 @@ ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b ; CHECK-BASELINE-NEXT: xorb %r11b, %sil ; CHECK-BASELINE-NEXT: xorb %r12b, %dl -; CHECK-BASELINE-NEXT: xorb %r15b, %cl -; CHECK-BASELINE-NEXT: xorb %r14b, %r8b -; CHECK-BASELINE-NEXT: xorb %bpl, %r9b +; CHECK-BASELINE-NEXT: xorb %r14b, %cl +; CHECK-BASELINE-NEXT: xorb %bpl, %r8b +; CHECK-BASELINE-NEXT: xorb %bl, %r9b +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r15b ; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r13b -; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %bl ; CHECK-BASELINE-NEXT: xorb %r10b, %al ; CHECK-BASELINE-NEXT: movb %al, 7(%rdi) -; CHECK-BASELINE-NEXT: movb %bl, 6(%rdi) -; CHECK-BASELINE-NEXT: movb %r13b, 5(%rdi) +; CHECK-BASELINE-NEXT: movb %r13b, 6(%rdi) +; CHECK-BASELINE-NEXT: movb %r15b, 5(%rdi) ; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdi) ; CHECK-BASELINE-NEXT: movb %r8b, 3(%rdi) ; CHECK-BASELINE-NEXT: movb %cl, 2(%rdi) @@ -2559,20 +2551,20 @@ ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: 
pushq %rbx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-SSE1-NEXT: xorb %r11b, %sil ; CHECK-SSE1-NEXT: xorb %r12b, %dl -; CHECK-SSE1-NEXT: xorb %r15b, %cl -; CHECK-SSE1-NEXT: xorb %r14b, %r8b -; CHECK-SSE1-NEXT: xorb %bpl, %r9b +; CHECK-SSE1-NEXT: xorb %r14b, %cl +; CHECK-SSE1-NEXT: xorb %bpl, %r8b +; CHECK-SSE1-NEXT: xorb %bl, %r9b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r15b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r13b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %bl ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-SSE1-NEXT: xorb %r10b, %al ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b @@ -2581,19 +2573,19 @@ ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b ; CHECK-SSE1-NEXT: xorb %r11b, %sil ; CHECK-SSE1-NEXT: xorb %r12b, %dl -; CHECK-SSE1-NEXT: xorb %r15b, %cl -; CHECK-SSE1-NEXT: xorb %r14b, %r8b -; CHECK-SSE1-NEXT: xorb %bpl, %r9b +; CHECK-SSE1-NEXT: xorb %r14b, %cl +; CHECK-SSE1-NEXT: xorb %bpl, %r8b +; CHECK-SSE1-NEXT: xorb %bl, %r9b +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r15b ; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r13b -; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %bl ; CHECK-SSE1-NEXT: xorb %r10b, %al ; CHECK-SSE1-NEXT: movb %al, 7(%rdi) -; CHECK-SSE1-NEXT: movb %bl, 6(%rdi) -; CHECK-SSE1-NEXT: movb %r13b, 5(%rdi) +; CHECK-SSE1-NEXT: movb %r13b, 6(%rdi) +; CHECK-SSE1-NEXT: movb %r15b, 5(%rdi) ; CHECK-SSE1-NEXT: movb %r9b, 4(%rdi) ; CHECK-SSE1-NEXT: movb %r8b, 3(%rdi) ; CHECK-SSE1-NEXT: movb %cl, 2(%rdi) @@ -2629,21 +2621,21 @@ ; CHECK-BASELINE-LABEL: in_v4i16: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: xorl %r9d, %esi -; CHECK-BASELINE-NEXT: xorl %edi, %edx -; CHECK-BASELINE-NEXT: xorl %r11d, %ecx -; CHECK-BASELINE-NEXT: xorl %r10d, %r8d +; CHECK-BASELINE-NEXT: xorl %r11d, %edx +; CHECK-BASELINE-NEXT: xorl %r10d, %ecx +; CHECK-BASELINE-NEXT: xorl %edi, %r8d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-BASELINE-NEXT: xorl %r9d, %esi -; CHECK-BASELINE-NEXT: xorl %edi, %edx -; CHECK-BASELINE-NEXT: xorl %r11d, %ecx -; CHECK-BASELINE-NEXT: xorl %r10d, %r8d +; CHECK-BASELINE-NEXT: xorl %r11d, %edx +; CHECK-BASELINE-NEXT: xorl %r10d, %ecx +; CHECK-BASELINE-NEXT: xorl %edi, %r8d ; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) ; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) ; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) @@ -2653,21 +2645,21 @@ ; CHECK-SSE1-LABEL: in_v4i16: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi ; 
CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: xorl %r9d, %esi -; CHECK-SSE1-NEXT: xorl %edi, %edx -; CHECK-SSE1-NEXT: xorl %r11d, %ecx -; CHECK-SSE1-NEXT: xorl %r10d, %r8d +; CHECK-SSE1-NEXT: xorl %r11d, %edx +; CHECK-SSE1-NEXT: xorl %r10d, %ecx +; CHECK-SSE1-NEXT: xorl %edi, %r8d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-SSE1-NEXT: xorl %r9d, %esi -; CHECK-SSE1-NEXT: xorl %edi, %edx -; CHECK-SSE1-NEXT: xorl %r11d, %ecx -; CHECK-SSE1-NEXT: xorl %r10d, %r8d +; CHECK-SSE1-NEXT: xorl %r11d, %edx +; CHECK-SSE1-NEXT: xorl %r10d, %ecx +; CHECK-SSE1-NEXT: xorl %edi, %r8d ; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) ; CHECK-SSE1-NEXT: movw %cx, 4(%rax) ; CHECK-SSE1-NEXT: movw %dx, 2(%rax) @@ -2767,12 +2759,12 @@ ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %esi ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edi @@ -2788,13 +2780,9 @@ ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b ; CHECK-BASELINE-NEXT: xorb %r11b, %r10b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorb %bl, %r11b +; CHECK-BASELINE-NEXT: xorb %r13b, %r11b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-BASELINE-NEXT: xorb %bl, %r11b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: xorb %r13b, %bl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: xorb %r13b, %bl +; CHECK-BASELINE-NEXT: xorb %r13b, %r11b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; CHECK-BASELINE-NEXT: xorb %r12b, %r13b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b @@ -2812,9 +2800,13 @@ ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b ; CHECK-BASELINE-NEXT: xorb %bpl, %r14b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: xorb %al, %bpl +; CHECK-BASELINE-NEXT: xorb %bl, %bpl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-BASELINE-NEXT: xorb %al, %bpl +; CHECK-BASELINE-NEXT: xorb %bl, %bpl +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: xorb %al, %bl ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al @@ -2825,12 +2817,12 @@ ; CHECK-BASELINE-NEXT: xorb %sil, %cl ; CHECK-BASELINE-NEXT: movb %cl, 15(%rdx) ; CHECK-BASELINE-NEXT: movb %al, 14(%rdx) -; CHECK-BASELINE-NEXT: movb %bpl, 13(%rdx) -; CHECK-BASELINE-NEXT: movb %r14b, 12(%rdx) -; CHECK-BASELINE-NEXT: movb %r15b, 11(%rdx) -; CHECK-BASELINE-NEXT: movb %r12b, 10(%rdx) -; CHECK-BASELINE-NEXT: movb %r13b, 9(%rdx) -; CHECK-BASELINE-NEXT: movb %bl, 8(%rdx) +; CHECK-BASELINE-NEXT: movb %bl, 13(%rdx) +; 
CHECK-BASELINE-NEXT: movb %bpl, 12(%rdx) +; CHECK-BASELINE-NEXT: movb %r14b, 11(%rdx) +; CHECK-BASELINE-NEXT: movb %r15b, 10(%rdx) +; CHECK-BASELINE-NEXT: movb %r12b, 9(%rdx) +; CHECK-BASELINE-NEXT: movb %r13b, 8(%rdx) ; CHECK-BASELINE-NEXT: movb %r11b, 7(%rdx) ; CHECK-BASELINE-NEXT: movb %r10b, 6(%rdx) ; CHECK-BASELINE-NEXT: movb %dil, 5(%rdx) @@ -2882,12 +2874,12 @@ ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi @@ -2903,13 +2895,9 @@ ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b ; CHECK-SSE1-NEXT: xorb %r11b, %r10b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorb %bl, %r11b +; CHECK-SSE1-NEXT: xorb %r13b, %r11b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-SSE1-NEXT: xorb %bl, %r11b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: xorb %r13b, %bl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: xorb %r13b, %bl +; CHECK-SSE1-NEXT: xorb %r13b, %r11b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; CHECK-SSE1-NEXT: xorb %r12b, %r13b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b @@ -2927,9 +2915,13 @@ ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b ; CHECK-SSE1-NEXT: xorb %bpl, %r14b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: xorb %al, %bpl +; CHECK-SSE1-NEXT: xorb %bl, %bpl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-SSE1-NEXT: xorb %al, %bpl +; CHECK-SSE1-NEXT: xorb %bl, %bpl +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: xorb %al, %bl ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al @@ -2940,12 +2932,12 @@ ; CHECK-SSE1-NEXT: xorb %sil, %cl ; CHECK-SSE1-NEXT: movb %cl, 15(%rdx) ; CHECK-SSE1-NEXT: movb %al, 14(%rdx) -; CHECK-SSE1-NEXT: movb %bpl, 13(%rdx) -; CHECK-SSE1-NEXT: movb %r14b, 12(%rdx) -; CHECK-SSE1-NEXT: movb %r15b, 11(%rdx) -; CHECK-SSE1-NEXT: movb %r12b, 10(%rdx) -; CHECK-SSE1-NEXT: movb %r13b, 9(%rdx) -; CHECK-SSE1-NEXT: movb %bl, 8(%rdx) +; CHECK-SSE1-NEXT: movb %bl, 13(%rdx) +; CHECK-SSE1-NEXT: movb %bpl, 12(%rdx) +; CHECK-SSE1-NEXT: movb %r14b, 11(%rdx) +; CHECK-SSE1-NEXT: movb %r15b, 10(%rdx) +; CHECK-SSE1-NEXT: movb %r12b, 9(%rdx) +; CHECK-SSE1-NEXT: movb %r13b, 8(%rdx) ; CHECK-SSE1-NEXT: movb %r11b, 7(%rdx) ; CHECK-SSE1-NEXT: movb %r10b, 6(%rdx) ; CHECK-SSE1-NEXT: movb %dil, 5(%rdx) @@ -3002,12 +2994,11 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-BASELINE-LABEL: in_v8i16: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: pushq %rbp ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: movl 
{{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: xorl %ebx, %esi ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si @@ -3028,38 +3019,36 @@ ; CHECK-BASELINE-NEXT: xorl %ebx, %r9d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w ; CHECK-BASELINE-NEXT: xorl %ebx, %r9d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: xorw %di, %bp -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bp -; CHECK-BASELINE-NEXT: xorl %edi, %ebp -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: xorw %r11w, %di -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di -; CHECK-BASELINE-NEXT: xorl %r11d, %edi ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: xorw %r10w, %bx +; CHECK-BASELINE-NEXT: xorw %r11w, %bx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx -; CHECK-BASELINE-NEXT: xorl %r10d, %ebx -; CHECK-BASELINE-NEXT: movw %bx, 14(%rax) -; CHECK-BASELINE-NEXT: movw %di, 12(%rax) -; CHECK-BASELINE-NEXT: movw %bp, 10(%rax) +; CHECK-BASELINE-NEXT: xorl %r11d, %ebx +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: xorw %r10w, %r11w +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-BASELINE-NEXT: xorl %r10d, %r11d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: xorw %di, %r10w +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: xorl %edi, %r10d +; CHECK-BASELINE-NEXT: movw %r10w, 14(%rax) +; CHECK-BASELINE-NEXT: movw %r11w, 12(%rax) +; CHECK-BASELINE-NEXT: movw %bx, 10(%rax) ; CHECK-BASELINE-NEXT: movw %r9w, 8(%rax) ; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) ; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) ; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) ; CHECK-BASELINE-NEXT: movw %si, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx -; CHECK-BASELINE-NEXT: popq %rbp ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: in_v8i16: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: pushq %rbp ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: xorl %ebx, %esi ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si @@ -3080,28 +3069,27 @@ ; CHECK-SSE1-NEXT: xorl %ebx, %r9d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w ; CHECK-SSE1-NEXT: xorl %ebx, %r9d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: xorw %di, %bp -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bp -; CHECK-SSE1-NEXT: xorl %edi, %ebp -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: xorw %r11w, %di -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di -; CHECK-SSE1-NEXT: xorl %r11d, %edi ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: xorw %r10w, %bx +; CHECK-SSE1-NEXT: xorw %r11w, %bx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx -; CHECK-SSE1-NEXT: xorl %r10d, %ebx -; CHECK-SSE1-NEXT: movw %bx, 14(%rax) -; CHECK-SSE1-NEXT: movw %di, 12(%rax) -; CHECK-SSE1-NEXT: movw %bp, 10(%rax) +; CHECK-SSE1-NEXT: xorl %r11d, %ebx +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: xorw %r10w, %r11w +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-SSE1-NEXT: xorl %r10d, %r11d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: xorw %di, %r10w +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: xorl %edi, %r10d +; CHECK-SSE1-NEXT: movw %r10w, 14(%rax) +; 
CHECK-SSE1-NEXT: movw %r11w, 12(%rax) +; CHECK-SSE1-NEXT: movw %bx, 10(%rax) ; CHECK-SSE1-NEXT: movw %r9w, 8(%rax) ; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) ; CHECK-SSE1-NEXT: movw %cx, 4(%rax) ; CHECK-SSE1-NEXT: movw %dx, 2(%rax) ; CHECK-SSE1-NEXT: movw %si, (%rax) ; CHECK-SSE1-NEXT: popq %rbx -; CHECK-SSE1-NEXT: popq %rbp ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_v8i16: @@ -3126,29 +3114,29 @@ ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl 12(%rdx), %r8d -; CHECK-BASELINE-NEXT: movl 8(%rdx), %r9d -; CHECK-BASELINE-NEXT: movl (%rdx), %r11d +; CHECK-BASELINE-NEXT: movl 12(%rdx), %edi +; CHECK-BASELINE-NEXT: movl 8(%rdx), %r8d +; CHECK-BASELINE-NEXT: movl (%rdx), %r9d ; CHECK-BASELINE-NEXT: movl 4(%rdx), %r10d ; CHECK-BASELINE-NEXT: movl (%rsi), %edx -; CHECK-BASELINE-NEXT: xorl %r11d, %edx -; CHECK-BASELINE-NEXT: movl 4(%rsi), %edi -; CHECK-BASELINE-NEXT: xorl %r10d, %edi +; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: movl 4(%rsi), %r11d +; CHECK-BASELINE-NEXT: xorl %r10d, %r11d ; CHECK-BASELINE-NEXT: movl 8(%rsi), %ebx -; CHECK-BASELINE-NEXT: xorl %r9d, %ebx +; CHECK-BASELINE-NEXT: xorl %r8d, %ebx ; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi -; CHECK-BASELINE-NEXT: xorl %r8d, %esi +; CHECK-BASELINE-NEXT: xorl %edi, %esi ; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi ; CHECK-BASELINE-NEXT: andl 8(%rcx), %ebx -; CHECK-BASELINE-NEXT: andl 4(%rcx), %edi +; CHECK-BASELINE-NEXT: andl 4(%rcx), %r11d ; CHECK-BASELINE-NEXT: andl (%rcx), %edx -; CHECK-BASELINE-NEXT: xorl %r11d, %edx -; CHECK-BASELINE-NEXT: xorl %r10d, %edi -; CHECK-BASELINE-NEXT: xorl %r9d, %ebx -; CHECK-BASELINE-NEXT: xorl %r8d, %esi +; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: xorl %r10d, %r11d +; CHECK-BASELINE-NEXT: xorl %r8d, %ebx +; CHECK-BASELINE-NEXT: xorl %edi, %esi ; CHECK-BASELINE-NEXT: movl %esi, 12(%rax) ; CHECK-BASELINE-NEXT: movl %ebx, 8(%rax) -; CHECK-BASELINE-NEXT: movl %edi, 4(%rax) +; CHECK-BASELINE-NEXT: movl %r11d, 4(%rax) ; CHECK-BASELINE-NEXT: movl %edx, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: retq @@ -3244,9 +3232,9 @@ ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdx, %r13 -; CHECK-BASELINE-NEXT: movq %rsi, %rbx +; CHECK-BASELINE-NEXT: movq %rsi, %r12 ; CHECK-BASELINE-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %r12d +; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %r15d ; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 13(%rdx), %eax @@ -3257,200 +3245,200 @@ ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r11d -; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r8d +; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r8d +; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r9d +; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r10d +; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r11d ; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi -; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %edi -; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %r14d -; CHECK-BASELINE-NEXT: movzbl (%rdx), %eax -; CHECK-BASELINE-NEXT: movzbl 1(%rdx), %r15d -; 
CHECK-BASELINE-NEXT: movzbl (%rbx), %edx -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb (%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 1(%rbx), %eax -; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %esi +; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %edx +; CHECK-BASELINE-NEXT: movzbl (%r13), %eax +; CHECK-BASELINE-NEXT: movzbl 1(%r13), %ebx +; CHECK-BASELINE-NEXT: movzbl (%r12), %r14d +; CHECK-BASELINE-NEXT: xorb %al, %r14b +; CHECK-BASELINE-NEXT: andb (%rcx), %r14b +; CHECK-BASELINE-NEXT: xorb %al, %r14b +; CHECK-BASELINE-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 1(%r12), %eax +; CHECK-BASELINE-NEXT: xorb %bl, %al ; CHECK-BASELINE-NEXT: andb 1(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: xorb %bl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 2(%rbx), %eax -; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: movzbl 2(%r12), %eax +; CHECK-BASELINE-NEXT: xorb %dl, %al ; CHECK-BASELINE-NEXT: andb 2(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: xorb %dl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 3(%rbx), %eax -; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movzbl 3(%r12), %eax +; CHECK-BASELINE-NEXT: xorb %sil, %al ; CHECK-BASELINE-NEXT: andb 3(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %dil, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 4(%rbx), %eax ; CHECK-BASELINE-NEXT: xorb %sil, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 4(%r12), %eax +; CHECK-BASELINE-NEXT: xorb %dil, %al ; CHECK-BASELINE-NEXT: andb 4(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %sil, %al +; CHECK-BASELINE-NEXT: xorb %dil, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 5(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 5(%r12), %eax ; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: andb 5(%rcx), %al ; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 6(%rbx), %eax -; CHECK-BASELINE-NEXT: xorb %r8b, %al -; CHECK-BASELINE-NEXT: andb 6(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %r8b, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 7(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 6(%r12), %eax ; CHECK-BASELINE-NEXT: xorb %r11b, %al -; CHECK-BASELINE-NEXT: andb 7(%rcx), %al +; CHECK-BASELINE-NEXT: andb 6(%rcx), %al ; CHECK-BASELINE-NEXT: xorb %r11b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 8(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 7(%r12), %eax ; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: andb 8(%rcx), %al +; CHECK-BASELINE-NEXT: andb 7(%rcx), %al ; CHECK-BASELINE-NEXT: xorb %r10b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 8(%r12), %eax ; CHECK-BASELINE-NEXT: xorb %r9b, %al -; CHECK-BASELINE-NEXT: andb 9(%rcx), %al +; CHECK-BASELINE-NEXT: andb 8(%rcx), 
%al ; CHECK-BASELINE-NEXT: xorb %r9b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 10(%rbx), %edx +; CHECK-BASELINE-NEXT: movzbl 9(%r12), %eax +; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: andb 9(%rcx), %al +; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 10(%r12), %edx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: andb 10(%rcx), %dl ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 11(%rbx), %edx +; CHECK-BASELINE-NEXT: movzbl 11(%r12), %edx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: andb 11(%rcx), %dl ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 12(%rbx), %edx +; CHECK-BASELINE-NEXT: movzbl 12(%r12), %edx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: andb 12(%rcx), %dl ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 13(%rbx), %edx +; CHECK-BASELINE-NEXT: movzbl 13(%r12), %edx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: andb 13(%rcx), %dl ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 14(%rbx), %edx +; CHECK-BASELINE-NEXT: movzbl 14(%r12), %edx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: andb 14(%rcx), %dl ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 15(%rbx), %eax -; CHECK-BASELINE-NEXT: xorb %r12b, %al +; CHECK-BASELINE-NEXT: movzbl 15(%r12), %eax +; CHECK-BASELINE-NEXT: xorb %r15b, %al ; CHECK-BASELINE-NEXT: andb 15(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %r12b, %al +; CHECK-BASELINE-NEXT: xorb %r15b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 16(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 16(%rbx), %edx +; CHECK-BASELINE-NEXT: movzbl 16(%r12), %edx ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: andb 16(%rcx), %dl ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 17(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 17(%rbx), %edx +; CHECK-BASELINE-NEXT: movzbl 17(%r12), %edx ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: andb 17(%rcx), %dl ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 18(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 18(%rbx), %edx +; CHECK-BASELINE-NEXT: movzbl 18(%r12), %edx ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: andb 18(%rcx), %dl ; CHECK-BASELINE-NEXT: xorb %al, %dl ; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 
19(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 19(%rbx), %r12d -; CHECK-BASELINE-NEXT: xorb %al, %r12b -; CHECK-BASELINE-NEXT: andb 19(%rcx), %r12b -; CHECK-BASELINE-NEXT: xorb %al, %r12b -; CHECK-BASELINE-NEXT: movzbl 20(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 20(%rbx), %r15d +; CHECK-BASELINE-NEXT: movzbl 19(%r12), %r15d ; CHECK-BASELINE-NEXT: xorb %al, %r15b -; CHECK-BASELINE-NEXT: andb 20(%rcx), %r15b -; CHECK-BASELINE-NEXT: movq %rcx, %rsi +; CHECK-BASELINE-NEXT: andb 19(%rcx), %r15b +; CHECK-BASELINE-NEXT: movq %rcx, %rdx ; CHECK-BASELINE-NEXT: xorb %al, %r15b -; CHECK-BASELINE-NEXT: movzbl 21(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 21(%rbx), %r14d +; CHECK-BASELINE-NEXT: movzbl 20(%r13), %eax +; CHECK-BASELINE-NEXT: movzbl 20(%r12), %r14d ; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: andb 21(%rcx), %r14b +; CHECK-BASELINE-NEXT: andb 20(%rcx), %r14b ; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: movzbl 22(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 22(%rbx), %ebp +; CHECK-BASELINE-NEXT: movzbl 21(%r13), %eax +; CHECK-BASELINE-NEXT: movzbl 21(%r12), %ebp ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: andb 22(%rcx), %bpl +; CHECK-BASELINE-NEXT: andb 21(%rcx), %bpl ; CHECK-BASELINE-NEXT: xorb %al, %bpl +; CHECK-BASELINE-NEXT: movzbl 22(%r13), %eax +; CHECK-BASELINE-NEXT: movzbl 22(%r12), %ebx +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: andb 22(%rcx), %bl +; CHECK-BASELINE-NEXT: xorb %al, %bl ; CHECK-BASELINE-NEXT: movzbl 23(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 23(%rbx), %r11d +; CHECK-BASELINE-NEXT: movzbl 23(%r12), %r11d ; CHECK-BASELINE-NEXT: xorb %al, %r11b ; CHECK-BASELINE-NEXT: andb 23(%rcx), %r11b ; CHECK-BASELINE-NEXT: xorb %al, %r11b ; CHECK-BASELINE-NEXT: movzbl 24(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 24(%rbx), %r10d +; CHECK-BASELINE-NEXT: movzbl 24(%r12), %r10d ; CHECK-BASELINE-NEXT: xorb %al, %r10b ; CHECK-BASELINE-NEXT: andb 24(%rcx), %r10b ; CHECK-BASELINE-NEXT: xorb %al, %r10b ; CHECK-BASELINE-NEXT: movzbl 25(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 25(%rbx), %r9d +; CHECK-BASELINE-NEXT: movzbl 25(%r12), %r9d ; CHECK-BASELINE-NEXT: xorb %al, %r9b ; CHECK-BASELINE-NEXT: andb 25(%rcx), %r9b ; CHECK-BASELINE-NEXT: xorb %al, %r9b ; CHECK-BASELINE-NEXT: movzbl 26(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 26(%rbx), %r8d +; CHECK-BASELINE-NEXT: movzbl 26(%r12), %r8d ; CHECK-BASELINE-NEXT: xorb %al, %r8b ; CHECK-BASELINE-NEXT: andb 26(%rcx), %r8b ; CHECK-BASELINE-NEXT: xorb %al, %r8b ; CHECK-BASELINE-NEXT: movzbl 27(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 27(%rbx), %edi +; CHECK-BASELINE-NEXT: movzbl 27(%r12), %edi ; CHECK-BASELINE-NEXT: xorb %al, %dil ; CHECK-BASELINE-NEXT: andb 27(%rcx), %dil ; CHECK-BASELINE-NEXT: xorb %al, %dil ; CHECK-BASELINE-NEXT: movzbl 28(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 28(%rbx), %edx -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 28(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: movzbl 28(%r12), %esi +; CHECK-BASELINE-NEXT: xorb %al, %sil +; CHECK-BASELINE-NEXT: andb 28(%rcx), %sil +; CHECK-BASELINE-NEXT: xorb %al, %sil ; CHECK-BASELINE-NEXT: movzbl 29(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 29(%rbx), %ecx +; CHECK-BASELINE-NEXT: movzbl 29(%r12), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 29(%rsi), %cl +; CHECK-BASELINE-NEXT: andb 29(%rdx), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movzbl 30(%r13), %eax ; CHECK-BASELINE-NEXT: movb %al, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 30(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 30(%r12), %eax ; CHECK-BASELINE-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: andb 30(%rsi), %al +; CHECK-BASELINE-NEXT: andb 30(%rdx), %al ; CHECK-BASELINE-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: movzbl 31(%r13), %r13d -; CHECK-BASELINE-NEXT: movzbl 31(%rbx), %ebx -; CHECK-BASELINE-NEXT: xorb %r13b, %bl -; CHECK-BASELINE-NEXT: andb 31(%rsi), %bl -; CHECK-BASELINE-NEXT: xorb %r13b, %bl +; CHECK-BASELINE-NEXT: movzbl 31(%r12), %r12d +; CHECK-BASELINE-NEXT: xorb %r13b, %r12b +; CHECK-BASELINE-NEXT: andb 31(%rdx), %r12b +; CHECK-BASELINE-NEXT: xorb %r13b, %r12b ; CHECK-BASELINE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 31(%r13) +; CHECK-BASELINE-NEXT: movb %r12b, 31(%r13) ; CHECK-BASELINE-NEXT: movb %al, 30(%r13) ; CHECK-BASELINE-NEXT: movb %cl, 29(%r13) -; CHECK-BASELINE-NEXT: movb %dl, 28(%r13) +; CHECK-BASELINE-NEXT: movb %sil, 28(%r13) ; CHECK-BASELINE-NEXT: movb %dil, 27(%r13) ; CHECK-BASELINE-NEXT: movb %r8b, 26(%r13) ; CHECK-BASELINE-NEXT: movb %r9b, 25(%r13) ; CHECK-BASELINE-NEXT: movb %r10b, 24(%r13) ; CHECK-BASELINE-NEXT: movb %r11b, 23(%r13) -; CHECK-BASELINE-NEXT: movb %bpl, 22(%r13) -; CHECK-BASELINE-NEXT: movb %r14b, 21(%r13) -; CHECK-BASELINE-NEXT: movb %r15b, 20(%r13) -; CHECK-BASELINE-NEXT: movb %r12b, 19(%r13) +; CHECK-BASELINE-NEXT: movb %bl, 22(%r13) +; CHECK-BASELINE-NEXT: movb %bpl, 21(%r13) +; CHECK-BASELINE-NEXT: movb %r14b, 20(%r13) +; CHECK-BASELINE-NEXT: movb %r15b, 19(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: movb %al, 18(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload @@ -3507,9 +3495,9 @@ ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdx, %r13 -; CHECK-SSE1-NEXT: movq %rsi, %rbx +; CHECK-SSE1-NEXT: movq %rsi, %r12 ; CHECK-SSE1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-SSE1-NEXT: movzbl 15(%rdx), %r12d +; CHECK-SSE1-NEXT: movzbl 15(%rdx), %r15d ; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 13(%rdx), %eax @@ -3520,200 +3508,200 @@ ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r9d -; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r10d -; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r11d -; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r8d +; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r8d +; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r9d +; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r10d +; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r11d ; CHECK-SSE1-NEXT: movzbl 5(%rdx), %ebp -; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi -; CHECK-SSE1-NEXT: movzbl 3(%rdx), %edi -; CHECK-SSE1-NEXT: movzbl 2(%rdx), %r14d -; CHECK-SSE1-NEXT: movzbl (%rdx), %eax -; CHECK-SSE1-NEXT: movzbl 1(%rdx), %r15d -; CHECK-SSE1-NEXT: movzbl (%rbx), %edx -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb (%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 1(%rbx), %eax -; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: movzbl 4(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 3(%rdx), %esi +; 
CHECK-SSE1-NEXT: movzbl 2(%rdx), %edx +; CHECK-SSE1-NEXT: movzbl (%r13), %eax +; CHECK-SSE1-NEXT: movzbl 1(%r13), %ebx +; CHECK-SSE1-NEXT: movzbl (%r12), %r14d +; CHECK-SSE1-NEXT: xorb %al, %r14b +; CHECK-SSE1-NEXT: andb (%rcx), %r14b +; CHECK-SSE1-NEXT: xorb %al, %r14b +; CHECK-SSE1-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 1(%r12), %eax +; CHECK-SSE1-NEXT: xorb %bl, %al ; CHECK-SSE1-NEXT: andb 1(%rcx), %al -; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: xorb %bl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 2(%rbx), %eax -; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: movzbl 2(%r12), %eax +; CHECK-SSE1-NEXT: xorb %dl, %al ; CHECK-SSE1-NEXT: andb 2(%rcx), %al -; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: xorb %dl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 3(%rbx), %eax -; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movzbl 3(%r12), %eax +; CHECK-SSE1-NEXT: xorb %sil, %al ; CHECK-SSE1-NEXT: andb 3(%rcx), %al -; CHECK-SSE1-NEXT: xorb %dil, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 4(%rbx), %eax ; CHECK-SSE1-NEXT: xorb %sil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 4(%r12), %eax +; CHECK-SSE1-NEXT: xorb %dil, %al ; CHECK-SSE1-NEXT: andb 4(%rcx), %al -; CHECK-SSE1-NEXT: xorb %sil, %al +; CHECK-SSE1-NEXT: xorb %dil, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 5(%rbx), %eax +; CHECK-SSE1-NEXT: movzbl 5(%r12), %eax ; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: andb 5(%rcx), %al ; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 6(%rbx), %eax -; CHECK-SSE1-NEXT: xorb %r8b, %al -; CHECK-SSE1-NEXT: andb 6(%rcx), %al -; CHECK-SSE1-NEXT: xorb %r8b, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 7(%rbx), %eax +; CHECK-SSE1-NEXT: movzbl 6(%r12), %eax ; CHECK-SSE1-NEXT: xorb %r11b, %al -; CHECK-SSE1-NEXT: andb 7(%rcx), %al +; CHECK-SSE1-NEXT: andb 6(%rcx), %al ; CHECK-SSE1-NEXT: xorb %r11b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 8(%rbx), %eax +; CHECK-SSE1-NEXT: movzbl 7(%r12), %eax ; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: andb 8(%rcx), %al +; CHECK-SSE1-NEXT: andb 7(%rcx), %al ; CHECK-SSE1-NEXT: xorb %r10b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%rbx), %eax +; CHECK-SSE1-NEXT: movzbl 8(%r12), %eax ; CHECK-SSE1-NEXT: xorb %r9b, %al -; CHECK-SSE1-NEXT: andb 9(%rcx), %al +; CHECK-SSE1-NEXT: andb 8(%rcx), %al ; CHECK-SSE1-NEXT: xorb %r9b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 10(%rbx), %edx +; CHECK-SSE1-NEXT: movzbl 9(%r12), %eax +; CHECK-SSE1-NEXT: xorb %r8b, %al +; CHECK-SSE1-NEXT: andb 9(%rcx), %al +; CHECK-SSE1-NEXT: xorb %r8b, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 10(%r12), %edx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: andb 10(%rcx), %dl ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: 
movzbl 11(%rbx), %edx +; CHECK-SSE1-NEXT: movzbl 11(%r12), %edx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: andb 11(%rcx), %dl ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 12(%rbx), %edx +; CHECK-SSE1-NEXT: movzbl 12(%r12), %edx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: andb 12(%rcx), %dl ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 13(%rbx), %edx +; CHECK-SSE1-NEXT: movzbl 13(%r12), %edx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: andb 13(%rcx), %dl ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 14(%rbx), %edx +; CHECK-SSE1-NEXT: movzbl 14(%r12), %edx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: andb 14(%rcx), %dl ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 15(%rbx), %eax -; CHECK-SSE1-NEXT: xorb %r12b, %al +; CHECK-SSE1-NEXT: movzbl 15(%r12), %eax +; CHECK-SSE1-NEXT: xorb %r15b, %al ; CHECK-SSE1-NEXT: andb 15(%rcx), %al -; CHECK-SSE1-NEXT: xorb %r12b, %al +; CHECK-SSE1-NEXT: xorb %r15b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 16(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 16(%rbx), %edx +; CHECK-SSE1-NEXT: movzbl 16(%r12), %edx ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: andb 16(%rcx), %dl ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 17(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 17(%rbx), %edx +; CHECK-SSE1-NEXT: movzbl 17(%r12), %edx ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: andb 17(%rcx), %dl ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 18(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 18(%rbx), %edx +; CHECK-SSE1-NEXT: movzbl 18(%r12), %edx ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: andb 18(%rcx), %dl ; CHECK-SSE1-NEXT: xorb %al, %dl ; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 19(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 19(%rbx), %r12d -; CHECK-SSE1-NEXT: xorb %al, %r12b -; CHECK-SSE1-NEXT: andb 19(%rcx), %r12b -; CHECK-SSE1-NEXT: xorb %al, %r12b -; CHECK-SSE1-NEXT: movzbl 20(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 20(%rbx), %r15d +; CHECK-SSE1-NEXT: movzbl 19(%r12), %r15d ; CHECK-SSE1-NEXT: xorb %al, %r15b -; CHECK-SSE1-NEXT: andb 20(%rcx), %r15b -; CHECK-SSE1-NEXT: movq %rcx, %rsi +; CHECK-SSE1-NEXT: andb 19(%rcx), %r15b +; CHECK-SSE1-NEXT: movq %rcx, %rdx ; CHECK-SSE1-NEXT: xorb %al, %r15b -; CHECK-SSE1-NEXT: movzbl 21(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 21(%rbx), %r14d +; CHECK-SSE1-NEXT: movzbl 20(%r13), %eax +; CHECK-SSE1-NEXT: movzbl 20(%r12), %r14d ; CHECK-SSE1-NEXT: xorb %al, %r14b -; CHECK-SSE1-NEXT: andb 21(%rcx), %r14b +; CHECK-SSE1-NEXT: andb 20(%rcx), %r14b ; CHECK-SSE1-NEXT: xorb %al, %r14b -; CHECK-SSE1-NEXT: movzbl 22(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 22(%rbx), %ebp +; CHECK-SSE1-NEXT: movzbl 21(%r13), %eax 
+; CHECK-SSE1-NEXT: movzbl 21(%r12), %ebp ; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: andb 22(%rcx), %bpl +; CHECK-SSE1-NEXT: andb 21(%rcx), %bpl ; CHECK-SSE1-NEXT: xorb %al, %bpl +; CHECK-SSE1-NEXT: movzbl 22(%r13), %eax +; CHECK-SSE1-NEXT: movzbl 22(%r12), %ebx +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: andb 22(%rcx), %bl +; CHECK-SSE1-NEXT: xorb %al, %bl ; CHECK-SSE1-NEXT: movzbl 23(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 23(%rbx), %r11d +; CHECK-SSE1-NEXT: movzbl 23(%r12), %r11d ; CHECK-SSE1-NEXT: xorb %al, %r11b ; CHECK-SSE1-NEXT: andb 23(%rcx), %r11b ; CHECK-SSE1-NEXT: xorb %al, %r11b ; CHECK-SSE1-NEXT: movzbl 24(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 24(%rbx), %r10d +; CHECK-SSE1-NEXT: movzbl 24(%r12), %r10d ; CHECK-SSE1-NEXT: xorb %al, %r10b ; CHECK-SSE1-NEXT: andb 24(%rcx), %r10b ; CHECK-SSE1-NEXT: xorb %al, %r10b ; CHECK-SSE1-NEXT: movzbl 25(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 25(%rbx), %r9d +; CHECK-SSE1-NEXT: movzbl 25(%r12), %r9d ; CHECK-SSE1-NEXT: xorb %al, %r9b ; CHECK-SSE1-NEXT: andb 25(%rcx), %r9b ; CHECK-SSE1-NEXT: xorb %al, %r9b ; CHECK-SSE1-NEXT: movzbl 26(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 26(%rbx), %r8d +; CHECK-SSE1-NEXT: movzbl 26(%r12), %r8d ; CHECK-SSE1-NEXT: xorb %al, %r8b ; CHECK-SSE1-NEXT: andb 26(%rcx), %r8b ; CHECK-SSE1-NEXT: xorb %al, %r8b ; CHECK-SSE1-NEXT: movzbl 27(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 27(%rbx), %edi +; CHECK-SSE1-NEXT: movzbl 27(%r12), %edi ; CHECK-SSE1-NEXT: xorb %al, %dil ; CHECK-SSE1-NEXT: andb 27(%rcx), %dil ; CHECK-SSE1-NEXT: xorb %al, %dil ; CHECK-SSE1-NEXT: movzbl 28(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 28(%rbx), %edx -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 28(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: movzbl 28(%r12), %esi +; CHECK-SSE1-NEXT: xorb %al, %sil +; CHECK-SSE1-NEXT: andb 28(%rcx), %sil +; CHECK-SSE1-NEXT: xorb %al, %sil ; CHECK-SSE1-NEXT: movzbl 29(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 29(%rbx), %ecx +; CHECK-SSE1-NEXT: movzbl 29(%r12), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 29(%rsi), %cl +; CHECK-SSE1-NEXT: andb 29(%rdx), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movzbl 30(%r13), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 30(%rbx), %eax +; CHECK-SSE1-NEXT: movzbl 30(%r12), %eax ; CHECK-SSE1-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload -; CHECK-SSE1-NEXT: andb 30(%rsi), %al +; CHECK-SSE1-NEXT: andb 30(%rdx), %al ; CHECK-SSE1-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload ; CHECK-SSE1-NEXT: movzbl 31(%r13), %r13d -; CHECK-SSE1-NEXT: movzbl 31(%rbx), %ebx -; CHECK-SSE1-NEXT: xorb %r13b, %bl -; CHECK-SSE1-NEXT: andb 31(%rsi), %bl -; CHECK-SSE1-NEXT: xorb %r13b, %bl +; CHECK-SSE1-NEXT: movzbl 31(%r12), %r12d +; CHECK-SSE1-NEXT: xorb %r13b, %r12b +; CHECK-SSE1-NEXT: andb 31(%rdx), %r12b +; CHECK-SSE1-NEXT: xorb %r13b, %r12b ; CHECK-SSE1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 31(%r13) +; CHECK-SSE1-NEXT: movb %r12b, 31(%r13) ; CHECK-SSE1-NEXT: movb %al, 30(%r13) ; CHECK-SSE1-NEXT: movb %cl, 29(%r13) -; CHECK-SSE1-NEXT: movb %dl, 28(%r13) +; CHECK-SSE1-NEXT: movb %sil, 28(%r13) ; CHECK-SSE1-NEXT: movb %dil, 27(%r13) ; CHECK-SSE1-NEXT: movb %r8b, 26(%r13) ; CHECK-SSE1-NEXT: movb %r9b, 25(%r13) ; CHECK-SSE1-NEXT: movb %r10b, 24(%r13) ; CHECK-SSE1-NEXT: movb %r11b, 23(%r13) -; CHECK-SSE1-NEXT: movb %bpl, 22(%r13) -; CHECK-SSE1-NEXT: movb %r14b, 21(%r13) -; CHECK-SSE1-NEXT: movb 
%r15b, 20(%r13) -; CHECK-SSE1-NEXT: movb %r12b, 19(%r13) +; CHECK-SSE1-NEXT: movb %bl, 22(%r13) +; CHECK-SSE1-NEXT: movb %bpl, 21(%r13) +; CHECK-SSE1-NEXT: movb %r14b, 20(%r13) +; CHECK-SSE1-NEXT: movb %r15b, 19(%r13) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: movb %al, 18(%r13) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload @@ -3811,22 +3799,22 @@ ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %eax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 20(%rdx), %r11d +; CHECK-BASELINE-NEXT: movl 20(%rdx), %r8d +; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r11d ; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r14d +; CHECK-BASELINE-NEXT: movl 16(%rdx), %ebx +; CHECK-BASELINE-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %ebp +; CHECK-BASELINE-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl 12(%rdx), %r14d ; CHECK-BASELINE-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 16(%rdx), %r15d +; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r15d ; CHECK-BASELINE-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %r12d +; CHECK-BASELINE-NEXT: movl 8(%rdx), %r12d ; CHECK-BASELINE-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 12(%rdx), %r13d +; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r13d ; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r8d -; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebx -; CHECK-BASELINE-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %ebp -; CHECK-BASELINE-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl (%rdx), %ecx ; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl 4(%rdx), %edi @@ -3842,24 +3830,23 @@ ; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %di, %ax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %edx -; CHECK-BASELINE-NEXT: xorw %bp, %dx -; CHECK-BASELINE-NEXT: movl %edx, %eax -; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %ecx -; CHECK-BASELINE-NEXT: xorw %bx, %cx -; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %edx -; CHECK-BASELINE-NEXT: xorw %r8w, %dx -; CHECK-BASELINE-NEXT: movl %edx, %r8d +; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %ecx +; CHECK-BASELINE-NEXT: xorw %r13w, %cx +; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r12w, %ax +; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r15w, %ax ; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %edx -; CHECK-BASELINE-NEXT: xorw %r13w, %dx +; CHECK-BASELINE-NEXT: xorw %r14w, %dx ; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %r13d -; CHECK-BASELINE-NEXT: xorw %r12w, %r13w +; CHECK-BASELINE-NEXT: xorw %bp, %r13w ; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %r12d -; CHECK-BASELINE-NEXT: xorw %r15w, %r12w +; CHECK-BASELINE-NEXT: xorw 
%bx, %r12w ; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %r15d -; CHECK-BASELINE-NEXT: xorw %r14w, %r15w +; CHECK-BASELINE-NEXT: xorw %r11w, %r15w ; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %r14d -; CHECK-BASELINE-NEXT: xorw %r11w, %r14w +; CHECK-BASELINE-NEXT: xorw %r8w, %r14w ; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %ebp ; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload ; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %ebx @@ -3881,12 +3868,12 @@ ; CHECK-BASELINE-NEXT: andw 14(%r9), %r13w ; CHECK-BASELINE-NEXT: andw 12(%r9), %dx ; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: andw 10(%r9), %r8w -; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl %ecx, %edx -; CHECK-BASELINE-NEXT: andw 8(%r9), %dx -; CHECK-BASELINE-NEXT: andw 6(%r9), %ax +; CHECK-BASELINE-NEXT: andw 10(%r9), %ax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-BASELINE-NEXT: andw 8(%r9), %dx +; CHECK-BASELINE-NEXT: andw 6(%r9), %cx +; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload ; CHECK-BASELINE-NEXT: andw 4(%r9), %r8w ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload @@ -3962,22 +3949,22 @@ ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 22(%rdx), %eax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 20(%rdx), %r11d +; CHECK-SSE1-NEXT: movl 20(%rdx), %r8d +; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r11d ; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r14d +; CHECK-SSE1-NEXT: movl 16(%rdx), %ebx +; CHECK-SSE1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 14(%rdx), %ebp +; CHECK-SSE1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl 12(%rdx), %r14d ; CHECK-SSE1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 16(%rdx), %r15d +; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r15d ; CHECK-SSE1-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 14(%rdx), %r12d +; CHECK-SSE1-NEXT: movl 8(%rdx), %r12d ; CHECK-SSE1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 12(%rdx), %r13d +; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r13d ; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r8d -; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 8(%rdx), %ebx -; CHECK-SSE1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 6(%rdx), %ebp -; CHECK-SSE1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl (%rdx), %ecx ; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl 4(%rdx), %edi @@ -3993,24 +3980,23 @@ ; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax ; CHECK-SSE1-NEXT: xorw %di, %ax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 6(%rsi), %edx -; CHECK-SSE1-NEXT: xorw %bp, %dx -; CHECK-SSE1-NEXT: movl %edx, %eax -; CHECK-SSE1-NEXT: movzwl 8(%rsi), %ecx -; CHECK-SSE1-NEXT: xorw %bx, %cx -; 
CHECK-SSE1-NEXT: movzwl 10(%rsi), %edx -; CHECK-SSE1-NEXT: xorw %r8w, %dx -; CHECK-SSE1-NEXT: movl %edx, %r8d +; CHECK-SSE1-NEXT: movzwl 6(%rsi), %ecx +; CHECK-SSE1-NEXT: xorw %r13w, %cx +; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r12w, %ax +; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r15w, %ax ; CHECK-SSE1-NEXT: movzwl 12(%rsi), %edx -; CHECK-SSE1-NEXT: xorw %r13w, %dx +; CHECK-SSE1-NEXT: xorw %r14w, %dx ; CHECK-SSE1-NEXT: movzwl 14(%rsi), %r13d -; CHECK-SSE1-NEXT: xorw %r12w, %r13w +; CHECK-SSE1-NEXT: xorw %bp, %r13w ; CHECK-SSE1-NEXT: movzwl 16(%rsi), %r12d -; CHECK-SSE1-NEXT: xorw %r15w, %r12w +; CHECK-SSE1-NEXT: xorw %bx, %r12w ; CHECK-SSE1-NEXT: movzwl 18(%rsi), %r15d -; CHECK-SSE1-NEXT: xorw %r14w, %r15w +; CHECK-SSE1-NEXT: xorw %r11w, %r15w ; CHECK-SSE1-NEXT: movzwl 20(%rsi), %r14d -; CHECK-SSE1-NEXT: xorw %r11w, %r14w +; CHECK-SSE1-NEXT: xorw %r8w, %r14w ; CHECK-SSE1-NEXT: movzwl 22(%rsi), %ebp ; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload ; CHECK-SSE1-NEXT: movzwl 24(%rsi), %ebx @@ -4032,12 +4018,12 @@ ; CHECK-SSE1-NEXT: andw 14(%r9), %r13w ; CHECK-SSE1-NEXT: andw 12(%r9), %dx ; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: andw 10(%r9), %r8w -; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl %ecx, %edx -; CHECK-SSE1-NEXT: andw 8(%r9), %dx -; CHECK-SSE1-NEXT: andw 6(%r9), %ax +; CHECK-SSE1-NEXT: andw 10(%r9), %ax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-SSE1-NEXT: andw 8(%r9), %dx +; CHECK-SSE1-NEXT: andw 6(%r9), %cx +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload ; CHECK-SSE1-NEXT: andw 4(%r9), %r8w ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload @@ -4131,57 +4117,57 @@ ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movl 28(%rdx), %r15d -; CHECK-BASELINE-NEXT: movl 24(%rdx), %r14d +; CHECK-BASELINE-NEXT: movl 28(%rdx), %ebp +; CHECK-BASELINE-NEXT: movl 24(%rdx), %ebx ; CHECK-BASELINE-NEXT: movl 20(%rdx), %r10d ; CHECK-BASELINE-NEXT: movl 16(%rdx), %eax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 12(%rdx), %ebp -; CHECK-BASELINE-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebx -; CHECK-BASELINE-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl (%rdx), %r12d +; CHECK-BASELINE-NEXT: movl 12(%rdx), %r12d +; CHECK-BASELINE-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl 8(%rdx), %r14d +; CHECK-BASELINE-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl (%rdx), %r15d ; CHECK-BASELINE-NEXT: movl 4(%rdx), %r13d -; CHECK-BASELINE-NEXT: movl (%rsi), %r11d -; CHECK-BASELINE-NEXT: xorl %r12d, %r11d +; CHECK-BASELINE-NEXT: movl (%rsi), %r8d +; CHECK-BASELINE-NEXT: xorl %r15d, %r8d ; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d ; CHECK-BASELINE-NEXT: xorl %r13d, %r9d -; CHECK-BASELINE-NEXT: movl 8(%rsi), %r8d -; CHECK-BASELINE-NEXT: xorl %ebx, %r8d -; CHECK-BASELINE-NEXT: movl 12(%rsi), %ebx -; CHECK-BASELINE-NEXT: xorl %ebp, %ebx -; 
CHECK-BASELINE-NEXT: movl 16(%rsi), %ebp -; CHECK-BASELINE-NEXT: xorl %eax, %ebp +; CHECK-BASELINE-NEXT: movl 8(%rsi), %r11d +; CHECK-BASELINE-NEXT: xorl %r14d, %r11d +; CHECK-BASELINE-NEXT: movl 12(%rsi), %r14d +; CHECK-BASELINE-NEXT: xorl %r12d, %r14d +; CHECK-BASELINE-NEXT: movl 16(%rsi), %r12d +; CHECK-BASELINE-NEXT: xorl %eax, %r12d ; CHECK-BASELINE-NEXT: movl 20(%rsi), %edx ; CHECK-BASELINE-NEXT: xorl %r10d, %edx ; CHECK-BASELINE-NEXT: movl 24(%rsi), %eax -; CHECK-BASELINE-NEXT: xorl %r14d, %eax +; CHECK-BASELINE-NEXT: xorl %ebx, %eax ; CHECK-BASELINE-NEXT: movl 28(%rsi), %esi -; CHECK-BASELINE-NEXT: xorl %r15d, %esi +; CHECK-BASELINE-NEXT: xorl %ebp, %esi ; CHECK-BASELINE-NEXT: andl 28(%rcx), %esi ; CHECK-BASELINE-NEXT: andl 24(%rcx), %eax ; CHECK-BASELINE-NEXT: andl 20(%rcx), %edx -; CHECK-BASELINE-NEXT: andl 16(%rcx), %ebp -; CHECK-BASELINE-NEXT: andl 12(%rcx), %ebx -; CHECK-BASELINE-NEXT: andl 8(%rcx), %r8d +; CHECK-BASELINE-NEXT: andl 16(%rcx), %r12d +; CHECK-BASELINE-NEXT: andl 12(%rcx), %r14d +; CHECK-BASELINE-NEXT: andl 8(%rcx), %r11d ; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d -; CHECK-BASELINE-NEXT: andl (%rcx), %r11d -; CHECK-BASELINE-NEXT: xorl %r12d, %r11d +; CHECK-BASELINE-NEXT: andl (%rcx), %r8d +; CHECK-BASELINE-NEXT: xorl %r15d, %r8d ; CHECK-BASELINE-NEXT: xorl %r13d, %r9d -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl %r10d, %edx -; CHECK-BASELINE-NEXT: xorl %r14d, %eax -; CHECK-BASELINE-NEXT: xorl %r15d, %esi +; CHECK-BASELINE-NEXT: xorl %ebx, %eax +; CHECK-BASELINE-NEXT: xorl %ebp, %esi ; CHECK-BASELINE-NEXT: movl %esi, 28(%rdi) ; CHECK-BASELINE-NEXT: movl %eax, 24(%rdi) ; CHECK-BASELINE-NEXT: movl %edx, 20(%rdi) -; CHECK-BASELINE-NEXT: movl %ebp, 16(%rdi) -; CHECK-BASELINE-NEXT: movl %ebx, 12(%rdi) -; CHECK-BASELINE-NEXT: movl %r8d, 8(%rdi) +; CHECK-BASELINE-NEXT: movl %r12d, 16(%rdi) +; CHECK-BASELINE-NEXT: movl %r14d, 12(%rdi) +; CHECK-BASELINE-NEXT: movl %r11d, 8(%rdi) ; CHECK-BASELINE-NEXT: movl %r9d, 4(%rdi) -; CHECK-BASELINE-NEXT: movl %r11d, (%rdi) +; CHECK-BASELINE-NEXT: movl %r8d, (%rdi) ; CHECK-BASELINE-NEXT: movq %rdi, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 @@ -4199,57 +4185,57 @@ ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movl 28(%rdx), %r15d -; CHECK-SSE1-NEXT: movl 24(%rdx), %r14d +; CHECK-SSE1-NEXT: movl 28(%rdx), %ebp +; CHECK-SSE1-NEXT: movl 24(%rdx), %ebx ; CHECK-SSE1-NEXT: movl 20(%rdx), %r10d ; CHECK-SSE1-NEXT: movl 16(%rdx), %eax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 12(%rdx), %ebp -; CHECK-SSE1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 8(%rdx), %ebx -; CHECK-SSE1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl (%rdx), %r12d +; CHECK-SSE1-NEXT: movl 12(%rdx), %r12d +; CHECK-SSE1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl 8(%rdx), %r14d +; CHECK-SSE1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 
+; CHECK-SSE1-NEXT: movl (%rdx), %r15d ; CHECK-SSE1-NEXT: movl 4(%rdx), %r13d -; CHECK-SSE1-NEXT: movl (%rsi), %r11d -; CHECK-SSE1-NEXT: xorl %r12d, %r11d +; CHECK-SSE1-NEXT: movl (%rsi), %r8d +; CHECK-SSE1-NEXT: xorl %r15d, %r8d ; CHECK-SSE1-NEXT: movl 4(%rsi), %r9d ; CHECK-SSE1-NEXT: xorl %r13d, %r9d -; CHECK-SSE1-NEXT: movl 8(%rsi), %r8d -; CHECK-SSE1-NEXT: xorl %ebx, %r8d -; CHECK-SSE1-NEXT: movl 12(%rsi), %ebx -; CHECK-SSE1-NEXT: xorl %ebp, %ebx -; CHECK-SSE1-NEXT: movl 16(%rsi), %ebp -; CHECK-SSE1-NEXT: xorl %eax, %ebp +; CHECK-SSE1-NEXT: movl 8(%rsi), %r11d +; CHECK-SSE1-NEXT: xorl %r14d, %r11d +; CHECK-SSE1-NEXT: movl 12(%rsi), %r14d +; CHECK-SSE1-NEXT: xorl %r12d, %r14d +; CHECK-SSE1-NEXT: movl 16(%rsi), %r12d +; CHECK-SSE1-NEXT: xorl %eax, %r12d ; CHECK-SSE1-NEXT: movl 20(%rsi), %edx ; CHECK-SSE1-NEXT: xorl %r10d, %edx ; CHECK-SSE1-NEXT: movl 24(%rsi), %eax -; CHECK-SSE1-NEXT: xorl %r14d, %eax +; CHECK-SSE1-NEXT: xorl %ebx, %eax ; CHECK-SSE1-NEXT: movl 28(%rsi), %esi -; CHECK-SSE1-NEXT: xorl %r15d, %esi +; CHECK-SSE1-NEXT: xorl %ebp, %esi ; CHECK-SSE1-NEXT: andl 28(%rcx), %esi ; CHECK-SSE1-NEXT: andl 24(%rcx), %eax ; CHECK-SSE1-NEXT: andl 20(%rcx), %edx -; CHECK-SSE1-NEXT: andl 16(%rcx), %ebp -; CHECK-SSE1-NEXT: andl 12(%rcx), %ebx -; CHECK-SSE1-NEXT: andl 8(%rcx), %r8d +; CHECK-SSE1-NEXT: andl 16(%rcx), %r12d +; CHECK-SSE1-NEXT: andl 12(%rcx), %r14d +; CHECK-SSE1-NEXT: andl 8(%rcx), %r11d ; CHECK-SSE1-NEXT: andl 4(%rcx), %r9d -; CHECK-SSE1-NEXT: andl (%rcx), %r11d -; CHECK-SSE1-NEXT: xorl %r12d, %r11d +; CHECK-SSE1-NEXT: andl (%rcx), %r8d +; CHECK-SSE1-NEXT: xorl %r15d, %r8d ; CHECK-SSE1-NEXT: xorl %r13d, %r9d -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl %r10d, %edx -; CHECK-SSE1-NEXT: xorl %r14d, %eax -; CHECK-SSE1-NEXT: xorl %r15d, %esi +; CHECK-SSE1-NEXT: xorl %ebx, %eax +; CHECK-SSE1-NEXT: xorl %ebp, %esi ; CHECK-SSE1-NEXT: movl %esi, 28(%rdi) ; CHECK-SSE1-NEXT: movl %eax, 24(%rdi) ; CHECK-SSE1-NEXT: movl %edx, 20(%rdi) -; CHECK-SSE1-NEXT: movl %ebp, 16(%rdi) -; CHECK-SSE1-NEXT: movl %ebx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %r8d, 8(%rdi) +; CHECK-SSE1-NEXT: movl %r12d, 16(%rdi) +; CHECK-SSE1-NEXT: movl %r14d, 12(%rdi) +; CHECK-SSE1-NEXT: movl %r11d, 8(%rdi) ; CHECK-SSE1-NEXT: movl %r9d, 4(%rdi) -; CHECK-SSE1-NEXT: movl %r11d, (%rdi) +; CHECK-SSE1-NEXT: movl %r8d, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 @@ -4293,29 +4279,29 @@ ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movq 24(%rdx), %r8 -; CHECK-BASELINE-NEXT: movq 16(%rdx), %r9 -; CHECK-BASELINE-NEXT: movq (%rdx), %r11 +; CHECK-BASELINE-NEXT: movq 24(%rdx), %rdi +; CHECK-BASELINE-NEXT: movq 16(%rdx), %r8 +; CHECK-BASELINE-NEXT: movq (%rdx), %r9 ; CHECK-BASELINE-NEXT: movq 8(%rdx), %r10 ; CHECK-BASELINE-NEXT: movq (%rsi), %rdx -; CHECK-BASELINE-NEXT: xorq %r11, %rdx -; CHECK-BASELINE-NEXT: movq 8(%rsi), %rdi -; CHECK-BASELINE-NEXT: xorq %r10, %rdi +; CHECK-BASELINE-NEXT: xorq %r9, %rdx +; CHECK-BASELINE-NEXT: movq 8(%rsi), %r11 +; 
CHECK-BASELINE-NEXT: xorq %r10, %r11 ; CHECK-BASELINE-NEXT: movq 16(%rsi), %rbx -; CHECK-BASELINE-NEXT: xorq %r9, %rbx +; CHECK-BASELINE-NEXT: xorq %r8, %rbx ; CHECK-BASELINE-NEXT: movq 24(%rsi), %rsi -; CHECK-BASELINE-NEXT: xorq %r8, %rsi +; CHECK-BASELINE-NEXT: xorq %rdi, %rsi ; CHECK-BASELINE-NEXT: andq 24(%rcx), %rsi ; CHECK-BASELINE-NEXT: andq 16(%rcx), %rbx -; CHECK-BASELINE-NEXT: andq 8(%rcx), %rdi +; CHECK-BASELINE-NEXT: andq 8(%rcx), %r11 ; CHECK-BASELINE-NEXT: andq (%rcx), %rdx -; CHECK-BASELINE-NEXT: xorq %r11, %rdx -; CHECK-BASELINE-NEXT: xorq %r10, %rdi -; CHECK-BASELINE-NEXT: xorq %r9, %rbx -; CHECK-BASELINE-NEXT: xorq %r8, %rsi +; CHECK-BASELINE-NEXT: xorq %r9, %rdx +; CHECK-BASELINE-NEXT: xorq %r10, %r11 +; CHECK-BASELINE-NEXT: xorq %r8, %rbx +; CHECK-BASELINE-NEXT: xorq %rdi, %rsi ; CHECK-BASELINE-NEXT: movq %rsi, 24(%rax) ; CHECK-BASELINE-NEXT: movq %rbx, 16(%rax) -; CHECK-BASELINE-NEXT: movq %rdi, 8(%rax) +; CHECK-BASELINE-NEXT: movq %r11, 8(%rax) ; CHECK-BASELINE-NEXT: movq %rdx, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: retq @@ -4324,29 +4310,29 @@ ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movq 24(%rdx), %r8 -; CHECK-SSE1-NEXT: movq 16(%rdx), %r9 -; CHECK-SSE1-NEXT: movq (%rdx), %r11 +; CHECK-SSE1-NEXT: movq 24(%rdx), %rdi +; CHECK-SSE1-NEXT: movq 16(%rdx), %r8 +; CHECK-SSE1-NEXT: movq (%rdx), %r9 ; CHECK-SSE1-NEXT: movq 8(%rdx), %r10 ; CHECK-SSE1-NEXT: movq (%rsi), %rdx -; CHECK-SSE1-NEXT: xorq %r11, %rdx -; CHECK-SSE1-NEXT: movq 8(%rsi), %rdi -; CHECK-SSE1-NEXT: xorq %r10, %rdi +; CHECK-SSE1-NEXT: xorq %r9, %rdx +; CHECK-SSE1-NEXT: movq 8(%rsi), %r11 +; CHECK-SSE1-NEXT: xorq %r10, %r11 ; CHECK-SSE1-NEXT: movq 16(%rsi), %rbx -; CHECK-SSE1-NEXT: xorq %r9, %rbx +; CHECK-SSE1-NEXT: xorq %r8, %rbx ; CHECK-SSE1-NEXT: movq 24(%rsi), %rsi -; CHECK-SSE1-NEXT: xorq %r8, %rsi +; CHECK-SSE1-NEXT: xorq %rdi, %rsi ; CHECK-SSE1-NEXT: andq 24(%rcx), %rsi ; CHECK-SSE1-NEXT: andq 16(%rcx), %rbx -; CHECK-SSE1-NEXT: andq 8(%rcx), %rdi +; CHECK-SSE1-NEXT: andq 8(%rcx), %r11 ; CHECK-SSE1-NEXT: andq (%rcx), %rdx -; CHECK-SSE1-NEXT: xorq %r11, %rdx -; CHECK-SSE1-NEXT: xorq %r10, %rdi -; CHECK-SSE1-NEXT: xorq %r9, %rbx -; CHECK-SSE1-NEXT: xorq %r8, %rsi +; CHECK-SSE1-NEXT: xorq %r9, %rdx +; CHECK-SSE1-NEXT: xorq %r10, %r11 +; CHECK-SSE1-NEXT: xorq %r8, %rbx +; CHECK-SSE1-NEXT: xorq %rdi, %rsi ; CHECK-SSE1-NEXT: movq %rsi, 24(%rax) ; CHECK-SSE1-NEXT: movq %rbx, 16(%rax) -; CHECK-SSE1-NEXT: movq %rdi, 8(%rax) +; CHECK-SSE1-NEXT: movq %r11, 8(%rax) ; CHECK-SSE1-NEXT: movq %rdx, (%rax) ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -932,24 +932,24 @@ ; SSE-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] ; SSE-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSE-NEXT: pand %xmm11, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE-NEXT: por %xmm9, %xmm4 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: psubq %xmm5, %xmm1 ; SSE-NEXT: pxor %xmm8, %xmm5 -; SSE-NEXT: pxor %xmm8, %xmm9 -; SSE-NEXT: movdqa 
%xmm9, %xmm4 -; SSE-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm5, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: pxor %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: psubq %xmm6, %xmm2 ; SSE-NEXT: pxor %xmm8, %xmm6 @@ -982,10 +982,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm8 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpsubq %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm4 ; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm6 diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -129,42 +129,42 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind { ; SSE3-LABEL: var_shuffle_v8i16: ; SSE3: # %bb.0: -; SSE3-NEXT: movd %xmm1, %r8d -; SSE3-NEXT: pextrw $1, %xmm1, %r9d -; SSE3-NEXT: pextrw $2, %xmm1, %r10d +; SSE3-NEXT: movd %xmm1, %eax +; SSE3-NEXT: pextrw $1, %xmm1, %ecx +; SSE3-NEXT: pextrw $2, %xmm1, %edx ; SSE3-NEXT: pextrw $3, %xmm1, %esi ; SSE3-NEXT: pextrw $4, %xmm1, %edi -; SSE3-NEXT: pextrw $5, %xmm1, %eax -; SSE3-NEXT: pextrw $6, %xmm1, %ecx -; SSE3-NEXT: pextrw $7, %xmm1, %edx +; SSE3-NEXT: pextrw $5, %xmm1, %r8d +; SSE3-NEXT: pextrw $6, %xmm1, %r9d +; SSE3-NEXT: pextrw $7, %xmm1, %r10d ; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: andl $7, %r8d -; SSE3-NEXT: andl $7, %r9d -; SSE3-NEXT: andl $7, %r10d -; SSE3-NEXT: andl $7, %esi -; SSE3-NEXT: andl $7, %edi ; SSE3-NEXT: andl $7, %eax ; SSE3-NEXT: andl $7, %ecx ; SSE3-NEXT: andl $7, %edx -; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx -; SSE3-NEXT: movd %edx, %xmm0 -; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx -; SSE3-NEXT: movd %ecx, %xmm1 +; SSE3-NEXT: andl $7, %esi +; SSE3-NEXT: andl $7, %edi +; SSE3-NEXT: andl $7, %r8d +; SSE3-NEXT: andl $7, %r9d +; SSE3-NEXT: andl $7, %r10d +; SSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d +; SSE3-NEXT: movd %r10d, %xmm0 +; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %r9d +; SSE3-NEXT: movd %r9d, %xmm1 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax -; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %eax -; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %r8d +; SSE3-NEXT: movd %r8d, %xmm0 +; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi +; SSE3-NEXT: movd %edi, %xmm2 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax -; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: movzwl 
-24(%rsp,%r10,2), %eax -; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi +; SSE3-NEXT: movd %esi, %xmm0 +; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx +; SSE3-NEXT: movd %edx, %xmm1 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax -; SSE3-NEXT: movd %eax, %xmm3 -; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax +; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx +; SSE3-NEXT: movd %ecx, %xmm3 +; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax ; SSE3-NEXT: movd %eax, %xmm0 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -231,15 +231,15 @@ ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm8 +; SSE3-NEXT: movd %eax, %xmm1 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm15 +; SSE3-NEXT: movd %eax, %xmm2 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm9 +; SSE3-NEXT: movd %eax, %xmm4 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -247,7 +247,7 @@ ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm10 +; SSE3-NEXT: movd %eax, %xmm5 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -255,7 +255,7 @@ ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm11 +; SSE3-NEXT: movd %eax, %xmm8 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -263,49 +263,49 @@ ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm12 +; SSE3-NEXT: movd %eax, %xmm9 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm5 +; SSE3-NEXT: movd %eax, %xmm10 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm13 +; SSE3-NEXT: movd %eax, %xmm11 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm4 +; SSE3-NEXT: movd %eax, %xmm12 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm14 +; SSE3-NEXT: movd %eax, %xmm13 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: movd %eax, %xmm14 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: movd %eax, %xmm15 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm15 = 
xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE3-NEXT: punpcklwd 
{{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE3-NEXT: retq ; @@ -495,15 +495,15 @@ ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm8 +; SSE3-NEXT: movd %eax, %xmm1 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm15 +; SSE3-NEXT: movd %eax, %xmm2 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm9 +; SSE3-NEXT: movd %eax, %xmm4 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -511,7 +511,7 @@ ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm10 +; SSE3-NEXT: movd %eax, %xmm5 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -519,7 +519,7 @@ ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm11 +; SSE3-NEXT: movd %eax, %xmm8 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -527,49 +527,49 @@ ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm12 +; SSE3-NEXT: movd %eax, %xmm9 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm5 +; SSE3-NEXT: movd %eax, %xmm10 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm13 +; SSE3-NEXT: movd %eax, %xmm11 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm4 +; SSE3-NEXT: movd %eax, %xmm12 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm14 +; SSE3-NEXT: movd %eax, %xmm13 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: movd %eax, %xmm14 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: movd %eax, %xmm15 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSE3-NEXT: 
punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE3-NEXT: retq ; @@ -656,114 +656,112 @@ ; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; SSE3-NEXT: movaps %xmm1, 
{{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, (%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d ; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp ; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE3-NEXT: andl $31, %r8d -; SSE3-NEXT: movzbl -96(%rsp,%r8), %esi -; SSE3-NEXT: movd %esi, %xmm8 -; SSE3-NEXT: andl $31, %ebp -; SSE3-NEXT: movzbl -64(%rsp,%rbp), %esi -; SSE3-NEXT: movd %esi, %xmm15 -; SSE3-NEXT: andl $31, %edx -; SSE3-NEXT: movzbl -32(%rsp,%rdx), %edx -; SSE3-NEXT: movd %edx, %xmm9 -; SSE3-NEXT: andl $31, %ecx -; SSE3-NEXT: movzbl (%rsp,%rcx), %ecx -; SSE3-NEXT: movd %ecx, %xmm3 +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $31, %eax -; SSE3-NEXT: movzbl 32(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm10 -; SSE3-NEXT: andl $31, %edi -; SSE3-NEXT: movzbl 64(%rsp,%rdi), %eax +; SSE3-NEXT: movzbl -96(%rsp,%rax), %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: andl $31, %ebp +; SSE3-NEXT: movzbl -64(%rsp,%rbp), %eax +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: andl $31, %r13d +; SSE3-NEXT: movzbl -32(%rsp,%r13), %eax +; SSE3-NEXT: movd %eax, %xmm4 +; SSE3-NEXT: andl $31, %r12d +; SSE3-NEXT: movzbl (%rsp,%r12), %eax +; SSE3-NEXT: movd %eax, %xmm3 +; SSE3-NEXT: andl $31, %r15d +; SSE3-NEXT: movzbl 32(%rsp,%r15), %eax +; SSE3-NEXT: movd %eax, %xmm5 +; SSE3-NEXT: andl $31, %r14d +; SSE3-NEXT: movzbl 64(%rsp,%r14), %eax ; SSE3-NEXT: movd %eax, %xmm7 ; SSE3-NEXT: 
andl $31, %ebx ; SSE3-NEXT: movzbl 96(%rsp,%rbx), %eax -; SSE3-NEXT: movd %eax, %xmm11 -; SSE3-NEXT: andl $31, %r9d -; SSE3-NEXT: movzbl 128(%rsp,%r9), %eax +; SSE3-NEXT: movd %eax, %xmm8 +; SSE3-NEXT: andl $31, %r11d +; SSE3-NEXT: movzbl 128(%rsp,%r11), %eax ; SSE3-NEXT: movd %eax, %xmm6 -; SSE3-NEXT: andl $31, %r13d -; SSE3-NEXT: movzbl 160(%rsp,%r13), %eax +; SSE3-NEXT: andl $31, %r10d +; SSE3-NEXT: movzbl 160(%rsp,%r10), %eax +; SSE3-NEXT: movd %eax, %xmm9 +; SSE3-NEXT: andl $31, %r9d +; SSE3-NEXT: movzbl 192(%rsp,%r9), %eax +; SSE3-NEXT: movd %eax, %xmm10 +; SSE3-NEXT: andl $31, %r8d +; SSE3-NEXT: movzbl 224(%rsp,%r8), %eax +; SSE3-NEXT: movd %eax, %xmm11 +; SSE3-NEXT: andl $31, %edi +; SSE3-NEXT: movzbl 256(%rsp,%rdi), %eax ; SSE3-NEXT: movd %eax, %xmm12 -; SSE3-NEXT: andl $31, %r12d -; SSE3-NEXT: movzbl 192(%rsp,%r12), %eax -; SSE3-NEXT: movd %eax, %xmm5 -; SSE3-NEXT: andl $31, %r15d -; SSE3-NEXT: movzbl 224(%rsp,%r15), %eax +; SSE3-NEXT: andl $31, %esi +; SSE3-NEXT: movzbl 288(%rsp,%rsi), %eax ; SSE3-NEXT: movd %eax, %xmm13 -; SSE3-NEXT: andl $31, %r14d -; SSE3-NEXT: movzbl 256(%rsp,%r14), %eax -; SSE3-NEXT: movd %eax, %xmm4 -; SSE3-NEXT: andl $31, %r11d -; SSE3-NEXT: movzbl 288(%rsp,%r11), %eax +; SSE3-NEXT: andl $31, %edx +; SSE3-NEXT: movzbl 320(%rsp,%rdx), %eax ; SSE3-NEXT: movd %eax, %xmm14 -; SSE3-NEXT: andl $31, %r10d -; SSE3-NEXT: movzbl 320(%rsp,%r10), %eax -; SSE3-NEXT: movd %eax, %xmm1 -; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE3-NEXT: andl $31, %eax -; SSE3-NEXT: movzbl 352(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: andl $31, %ecx +; SSE3-NEXT: movzbl 352(%rsp,%rcx), %eax +; SSE3-NEXT: movd %eax, %xmm15 ; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE3-NEXT: andl $31, %eax ; SSE3-NEXT: movzbl 384(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm0 -; SSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; 
SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE3-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSE3-NEXT: popq %rbx @@ -790,114 +788,112 @@ ; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl 
-{{[0-9]+}}(%rsp), %r13d +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, (%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d ; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp ; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSSE3-NEXT: andl $31, %r8d -; SSSE3-NEXT: movzbl -96(%rsp,%r8), %esi -; SSSE3-NEXT: movd %esi, %xmm8 -; SSSE3-NEXT: andl $31, %ebp -; SSSE3-NEXT: movzbl -64(%rsp,%rbp), %esi -; SSSE3-NEXT: movd %esi, %xmm15 -; SSSE3-NEXT: andl $31, %edx -; SSSE3-NEXT: movzbl -32(%rsp,%rdx), %edx -; SSSE3-NEXT: movd %edx, %xmm9 -; SSSE3-NEXT: andl $31, %ecx -; SSSE3-NEXT: movzbl (%rsp,%rcx), %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $31, %eax -; SSSE3-NEXT: movzbl 32(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm10 -; SSSE3-NEXT: andl $31, %edi -; SSSE3-NEXT: movzbl 64(%rsp,%rdi), %eax +; SSSE3-NEXT: movzbl -96(%rsp,%rax), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: andl $31, %ebp +; SSSE3-NEXT: movzbl -64(%rsp,%rbp), %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: andl $31, %r13d +; SSSE3-NEXT: movzbl -32(%rsp,%r13), %eax +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: andl $31, %r12d +; SSSE3-NEXT: movzbl (%rsp,%r12), %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: andl $31, %r15d +; SSSE3-NEXT: movzbl 32(%rsp,%r15), %eax +; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: andl $31, %r14d +; SSSE3-NEXT: movzbl 64(%rsp,%r14), %eax ; SSSE3-NEXT: movd %eax, %xmm7 ; SSSE3-NEXT: andl $31, %ebx ; SSSE3-NEXT: movzbl 96(%rsp,%rbx), %eax -; SSSE3-NEXT: movd %eax, %xmm11 -; SSSE3-NEXT: andl $31, %r9d -; SSSE3-NEXT: movzbl 128(%rsp,%r9), %eax +; SSSE3-NEXT: movd %eax, %xmm8 +; SSSE3-NEXT: andl $31, %r11d +; SSSE3-NEXT: movzbl 128(%rsp,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm6 -; SSSE3-NEXT: andl $31, %r13d -; SSSE3-NEXT: movzbl 160(%rsp,%r13), %eax +; SSSE3-NEXT: andl $31, %r10d +; SSSE3-NEXT: movzbl 160(%rsp,%r10), %eax +; SSSE3-NEXT: movd %eax, %xmm9 +; SSSE3-NEXT: andl $31, %r9d +; SSSE3-NEXT: movzbl 192(%rsp,%r9), %eax +; SSSE3-NEXT: movd %eax, %xmm10 +; SSSE3-NEXT: andl $31, %r8d +; SSSE3-NEXT: movzbl 224(%rsp,%r8), %eax +; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: andl $31, %edi +; SSSE3-NEXT: movzbl 256(%rsp,%rdi), %eax ; SSSE3-NEXT: movd %eax, %xmm12 -; SSSE3-NEXT: andl $31, %r12d -; SSSE3-NEXT: movzbl 192(%rsp,%r12), %eax -; SSSE3-NEXT: movd %eax, %xmm5 -; SSSE3-NEXT: 
andl $31, %r15d -; SSSE3-NEXT: movzbl 224(%rsp,%r15), %eax +; SSSE3-NEXT: andl $31, %esi +; SSSE3-NEXT: movzbl 288(%rsp,%rsi), %eax ; SSSE3-NEXT: movd %eax, %xmm13 -; SSSE3-NEXT: andl $31, %r14d -; SSSE3-NEXT: movzbl 256(%rsp,%r14), %eax -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: andl $31, %r11d -; SSSE3-NEXT: movzbl 288(%rsp,%r11), %eax +; SSSE3-NEXT: andl $31, %edx +; SSSE3-NEXT: movzbl 320(%rsp,%rdx), %eax ; SSSE3-NEXT: movd %eax, %xmm14 -; SSSE3-NEXT: andl $31, %r10d -; SSSE3-NEXT: movzbl 320(%rsp,%r10), %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSSE3-NEXT: andl $31, %eax -; SSSE3-NEXT: movzbl 352(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: andl $31, %ecx +; SSSE3-NEXT: movzbl 352(%rsp,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm15 ; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSSE3-NEXT: andl $31, %eax ; SSSE3-NEXT: movzbl 384(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSSE3-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSSE3-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll --- a/llvm/test/CodeGen/X86/var-permute-512.ll +++ b/llvm/test/CodeGen/X86/var-permute-512.ll @@ -1122,7 +1122,7 @@ ; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrd $3, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm8 +; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax @@ -1213,98 +1213,98 @@ ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm7, %xmm7 ; AVX512F-NEXT: vpextrd $3, %xmm2, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm8 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vmovd %xmm8, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $1, %xmm0, %eax +; AVX512F-NEXT: vpextrd $1, %xmm8, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $2, %xmm0, %eax +; AVX512F-NEXT: vpextrd $2, %xmm8, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $3, %xmm0, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm0 +; AVX512F-NEXT: vpextrd $3, %xmm8, %eax +; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm8 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vmovd %xmm8, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $1, %xmm0, %eax +; AVX512F-NEXT: vpextrd $1, %xmm8, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $2, %xmm0, %eax +; AVX512F-NEXT: vpextrd $2, %xmm8, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl 
(%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $3, %xmm0, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm0 +; AVX512F-NEXT: vpextrd $3, %xmm8, %eax +; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm2 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $11, %eax, %xmm7, %xmm2 -; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vmovd %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX512F-NEXT: vpextrd $1, %xmm0, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $1, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX512F-NEXT: vpextrd $2, %xmm0, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $2, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX512F-NEXT: vpextrd $3, %xmm0, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $3, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm3 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $11, %eax, %xmm6, %xmm3 -; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5 +; AVX512F-NEXT: vmovd %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vpextrd $1, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vpextrd $2, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $2, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vpextrd $3, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $3, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2 +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3 ; AVX512F-NEXT: vpextrd $1, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3 +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512F-NEXT: vpextrd $2, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512F-NEXT: vpextrd $3, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vcvtdq2ps %zmm0, %zmm0 +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 ; AVX512F-NEXT: vcvtdq2ps %zmm2, %zmm2 +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; 
AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbd %xmm8, %zmm3 -; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 -; AVX512F-NEXT: vmovaps %zmm3, 192(%rdi) +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vcvtdq2ps %zmm0, %zmm0 +; AVX512F-NEXT: vmovaps %zmm0, 192(%rdi) ; AVX512F-NEXT: vmovaps %zmm1, 128(%rdi) -; AVX512F-NEXT: vmovaps %zmm2, 64(%rdi) -; AVX512F-NEXT: vmovaps %zmm0, (%rdi) +; AVX512F-NEXT: vmovaps %zmm3, 64(%rdi) +; AVX512F-NEXT: vmovaps %zmm2, (%rdi) ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp ; AVX512F-NEXT: vzeroupper @@ -1373,7 +1373,7 @@ ; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm8 +; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax @@ -1464,98 +1464,98 @@ ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrd $3, %xmm2, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm8 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vmovd %xmm8, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $1, %xmm0, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm8, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $2, %xmm0, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm8, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $3, %xmm0, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm0 +; AVX512BW-NEXT: vpextrd $3, %xmm8, %eax +; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm8 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vmovd %xmm8, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $1, %xmm0, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm8, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $2, %xmm0, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm8, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $3, %xmm0, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm0 +; AVX512BW-NEXT: vpextrd $3, %xmm8, %eax +; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm2 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm7, %xmm2 -; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7 +; AVX512BW-NEXT: vmovd %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrd $1, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrd $1, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrd $2, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $13, %eax, 
%xmm7, %xmm7 +; AVX512BW-NEXT: vpextrd $2, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrd $3, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrd $3, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm3 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm6, %xmm3 -; AVX512BW-NEXT: vmovd %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5 +; AVX512BW-NEXT: vmovd %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrd $1, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrd $2, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrd $3, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2 +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3 +; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1 -; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512BW-NEXT: vcvtdq2ps %zmm0, %zmm0 +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 ; AVX512BW-NEXT: vpmovsxbd %xmm2, %zmm2 ; AVX512BW-NEXT: vcvtdq2ps %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512BW-NEXT: vcvtdq2ps %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbd %xmm8, %zmm3 -; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3 -; AVX512BW-NEXT: vmovaps %zmm3, 192(%rdi) +; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512BW-NEXT: vcvtdq2ps %zmm0, %zmm0 +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rdi) ; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdi) -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdi) -; AVX512BW-NEXT: vmovaps %zmm0, (%rdi) +; AVX512BW-NEXT: vmovaps %zmm3, 64(%rdi) +; AVX512BW-NEXT: vmovaps %zmm2, (%rdi) ; AVX512BW-NEXT: movq %rbp, %rsp ; AVX512BW-NEXT: popq %rbp ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -4786,54 +4786,54 @@ ; AVX1: # 
%bb.0: ; AVX1-NEXT: vmovapd (%rdi), %ymm2 ; AVX1-NEXT: vmovapd 32(%rdi), %ymm3 -; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [1,1,1,1] -; AVX1-NEXT: vandpd %ymm3, %ymm8, %ymm5 -; AVX1-NEXT: vmovdqa (%rdi), %xmm9 +; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,1,1] +; AVX1-NEXT: vandpd %ymm4, %ymm3, %ymm5 +; AVX1-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm6, %xmm7 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-NEXT: vpsrlq $1, %xmm4, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-NEXT: vorpd %ymm5, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm3, %ymm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm3 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm5 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-NEXT: vpsrlq $1, %xmm8, %xmm9 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-NEXT: vorpd %ymm5, %ymm7, %ymm5 +; AVX1-NEXT: vblendvpd %ymm3, %ymm5, %ymm3, %ymm3 +; AVX1-NEXT: vpextrq $1, %xmm3, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm5 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] -; AVX1-NEXT: vaddps %xmm0, %xmm0, %xmm3 -; AVX1-NEXT: vpackssdw %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vblendvps %xmm4, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vandpd %ymm2, %ymm8, %ymm3 -; AVX1-NEXT: vpsrlq $1, %xmm9, %xmm4 -; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-NEXT: vorpd %ymm3, %ymm4, %ymm3 -; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm2, %ymm2 -; AVX1-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm7 +; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm7 +; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3] +; AVX1-NEXT: vpextrq $1, %xmm3, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm3 -; AVX1-NEXT: vmovq %xmm2, %rax +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[0] +; AVX1-NEXT: vaddps %xmm3, %xmm3, %xmm5 +; AVX1-NEXT: vpackssdw %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vblendvps %xmm6, %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %ymm4, %ymm2, %ymm4 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm5 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-NEXT: vorpd %ymm4, %ymm5, %ymm4 +; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-NEXT: vmovq %xmm2, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm5 +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm5 +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0],xmm4[3] ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] -; AVX1-NEXT: vaddps %xmm2, %xmm2, %xmm3 
-; AVX1-NEXT: vpackssdw %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vblendvps %xmm1, %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[0] +; AVX1-NEXT: vaddps %xmm2, %xmm2, %xmm4 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm4, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: uitofp_load_8i64_to_8f32: diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -455,8 +455,8 @@ ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm8 -; AVX1-NEXT: vpcmpgtd %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7 ; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm7 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3 @@ -465,26 +465,26 @@ ; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 ; AVX1-NEXT: vpcmpgtd %xmm6, %xmm5, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 +; AVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm5 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4 +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi) +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) ; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) ; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) @@ -1038,110 +1038,110 @@ define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind { ; SSE2-LABEL: saddo_v2i128: ; SSE2: # %bb.0: -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: addq %r8, %rdi ; SSE2-NEXT: adcq %r9, %rsi ; SSE2-NEXT: seto %r8b ; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: seto %al -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movzbl %r8b, %eax -; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: seto %r9b +; SSE2-NEXT: movzbl %r9b, %r9d +; SSE2-NEXT: negl %r9d +; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: movzbl %r8b, %r8d +; SSE2-NEXT: negl %r8d +; SSE2-NEXT: movd %r8d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %rdx, 16(%r10) -; SSE2-NEXT: movq %rdi, (%r10) -; SSE2-NEXT: movq %rcx, 24(%r10) -; 
SSE2-NEXT: movq %rsi, 8(%r10) +; SSE2-NEXT: movq %rdx, 16(%rax) +; SSE2-NEXT: movq %rdi, (%rax) +; SSE2-NEXT: movq %rcx, 24(%rax) +; SSE2-NEXT: movq %rsi, 8(%rax) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: saddo_v2i128: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSSE3-NEXT: addq %r8, %rdi ; SSSE3-NEXT: adcq %r9, %rsi ; SSSE3-NEXT: seto %r8b ; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSSE3-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: seto %al -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: negl %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movzbl %r8b, %eax -; SSSE3-NEXT: negl %eax -; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: seto %r9b +; SSSE3-NEXT: movzbl %r9b, %r9d +; SSSE3-NEXT: negl %r9d +; SSSE3-NEXT: movd %r9d, %xmm1 +; SSSE3-NEXT: movzbl %r8b, %r8d +; SSSE3-NEXT: negl %r8d +; SSSE3-NEXT: movd %r8d, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %rdx, 16(%r10) -; SSSE3-NEXT: movq %rdi, (%r10) -; SSSE3-NEXT: movq %rcx, 24(%r10) -; SSSE3-NEXT: movq %rsi, 8(%r10) +; SSSE3-NEXT: movq %rdx, 16(%rax) +; SSSE3-NEXT: movq %rdi, (%rax) +; SSSE3-NEXT: movq %rcx, 24(%rax) +; SSSE3-NEXT: movq %rsi, 8(%rax) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: saddo_v2i128: ; SSE41: # %bb.0: -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE41-NEXT: addq %r8, %rdi ; SSE41-NEXT: adcq %r9, %rsi ; SSE41-NEXT: seto %r8b ; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: seto %al -; SSE41-NEXT: movzbl %al, %r9d +; SSE41-NEXT: seto %r9b +; SSE41-NEXT: movzbl %r9b, %r9d ; SSE41-NEXT: negl %r9d -; SSE41-NEXT: movzbl %r8b, %eax -; SSE41-NEXT: negl %eax -; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: movzbl %r8b, %r8d +; SSE41-NEXT: negl %r8d +; SSE41-NEXT: movd %r8d, %xmm0 ; SSE41-NEXT: pinsrd $1, %r9d, %xmm0 -; SSE41-NEXT: movq %rdx, 16(%r10) -; SSE41-NEXT: movq %rdi, (%r10) -; SSE41-NEXT: movq %rcx, 24(%r10) -; SSE41-NEXT: movq %rsi, 8(%r10) +; SSE41-NEXT: movq %rdx, 16(%rax) +; SSE41-NEXT: movq %rdi, (%rax) +; SSE41-NEXT: movq %rcx, 24(%rax) +; SSE41-NEXT: movq %rsi, 8(%rax) ; SSE41-NEXT: retq ; ; AVX-LABEL: saddo_v2i128: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: addq %r8, %rdi ; AVX-NEXT: adcq %r9, %rsi ; AVX-NEXT: seto %r8b ; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: seto %al -; AVX-NEXT: movzbl %al, %r9d +; AVX-NEXT: seto %r9b +; AVX-NEXT: movzbl %r9b, %r9d ; AVX-NEXT: negl %r9d -; AVX-NEXT: movzbl %r8b, %eax -; AVX-NEXT: negl %eax -; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: movzbl %r8b, %r8d +; AVX-NEXT: negl %r8d +; AVX-NEXT: vmovd %r8d, %xmm0 ; AVX-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0 -; AVX-NEXT: movq %rdx, 16(%r10) -; AVX-NEXT: movq %rdi, (%r10) -; AVX-NEXT: movq %rcx, 24(%r10) -; AVX-NEXT: movq %rsi, 8(%r10) +; AVX-NEXT: movq %rdx, 16(%rax) +; AVX-NEXT: movq %rdi, (%rax) +; AVX-NEXT: movq %rcx, 24(%rax) +; AVX-NEXT: movq %rsi, 8(%rax) ; AVX-NEXT: retq ; ; AVX512-LABEL: saddo_v2i128: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: seto %al -; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: seto %r10b +; AVX512-NEXT: kmovd %r10d, %k0 ; AVX512-NEXT: addq %r8, %rdi ; AVX512-NEXT: adcq %r9, %rsi -; AVX512-NEXT: seto %al -; AVX512-NEXT: andl $1, 
%eax -; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: seto %r8b +; AVX512-NEXT: andl $1, %r8d +; AVX512-NEXT: kmovw %r8d, %k1 ; AVX512-NEXT: kshiftlw $1, %k0, %k0 ; AVX512-NEXT: korw %k0, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: movq %rdx, 16(%r10) -; AVX512-NEXT: movq %rdi, (%r10) -; AVX512-NEXT: movq %rcx, 24(%r10) -; AVX512-NEXT: movq %rsi, 8(%r10) +; AVX512-NEXT: movq %rdx, 16(%rax) +; AVX512-NEXT: movq %rdi, (%rax) +; AVX512-NEXT: movq %rcx, 24(%rax) +; AVX512-NEXT: movq %rsi, 8(%rax) ; AVX512-NEXT: retq %t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -436,58 +436,58 @@ ; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE2-NEXT: movd %r9d, %xmm10 +; SSE2-NEXT: movd %r9d, %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movdqa %xmm10, %xmm9 -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE2-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: pmuludq %xmm6, %xmm10 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE2-NEXT: pmuludq %xmm6, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pand %xmm2, %xmm8 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE2-NEXT: pand %xmm5, %xmm9 +; SSE2-NEXT: paddd %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: psubd %xmm3, %xmm0 +; SSE2-NEXT: pmuludq %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; SSE2-NEXT: psubd %xmm9, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd 
%xmm6, %xmm3 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pand %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 ; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: pmuludq %xmm8, %xmm1 +; SSE2-NEXT: paddd %xmm8, %xmm7 +; SSE2-NEXT: pmuludq %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE2-NEXT: psubd %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: movq %xmm3, 16(%rcx) -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: movq %xmm3, 16(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movq %xmm0, 16(%rcx) +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm0 +; SSE2-NEXT: movq %xmm0, 16(%rdi) ; SSE2-NEXT: movdqa %xmm2, (%rdi) ; SSE2-NEXT: retq ; @@ -508,58 +508,58 @@ ; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSSE3-NEXT: movd %r9d, %xmm10 +; SSSE3-NEXT: movd %r9d, %xmm0 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa %xmm10, %xmm9 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSSE3-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: pmuludq %xmm6, %xmm10 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSSE3-NEXT: pmuludq %xmm6, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: paddd %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pand %xmm2, %xmm8 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9 +; SSSE3-NEXT: pand %xmm5, %xmm9 +; SSSE3-NEXT: paddd %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm5, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: psubd %xmm3, %xmm0 +; SSSE3-NEXT: pmuludq %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = 
xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; SSSE3-NEXT: psubd %xmm9, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSSE3-NEXT: movdqa %xmm2, (%rcx) ; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3 -; SSSE3-NEXT: pand %xmm9, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7 +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 +; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pand %xmm3, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 ; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: paddd %xmm3, %xmm7 -; SSSE3-NEXT: pmuludq %xmm8, %xmm1 +; SSSE3-NEXT: paddd %xmm8, %xmm7 +; SSSE3-NEXT: pmuludq %xmm4, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSSE3-NEXT: psubd %xmm7, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSSE3-NEXT: movq %xmm3, 16(%rcx) -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm3 -; SSSE3-NEXT: movq %xmm3, 16(%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movq %xmm0, 16(%rcx) +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm5, %xmm0 +; SSSE3-NEXT: movq %xmm0, 16(%rdi) ; SSSE3-NEXT: movdqa %xmm2, (%rdi) ; SSSE3-NEXT: retq ; @@ -892,84 +892,84 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] ; SSE2-NEXT: psubd %xmm10, %xmm9 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm9, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE2-NEXT: pand %xmm1, %xmm9 ; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 -; SSE2-NEXT: pand %xmm1, %xmm10 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: paddd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 
+; SSE2-NEXT: pand %xmm5, %xmm10 +; SSE2-NEXT: paddd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm10, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; SSE2-NEXT: psubd %xmm4, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; SSE2-NEXT: psubd %xmm10, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE2-NEXT: movdqa %xmm1, 16(%rdi) ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: paddd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE2-NEXT: pand %xmm6, %xmm9 +; SSE2-NEXT: paddd %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; SSE2-NEXT: psubd %xmm9, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE2-NEXT: movdqa %xmm2, 32(%rdi) ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 ; SSE2-NEXT: pand %xmm7, %xmm8 -; SSE2-NEXT: paddd %xmm4, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: paddd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE2-NEXT: psubd %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = 
xmm3[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: psubd %xmm8, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSE2-NEXT: movdqa %xmm3, 48(%rdi) ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE2-NEXT: pxor %xmm9, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: smulo_v16i32: @@ -985,84 +985,84 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm4, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm11, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] ; SSSE3-NEXT: psubd %xmm10, %xmm9 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSSE3-NEXT: movdqa %xmm0, (%rdi) ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9 -; SSSE3-NEXT: pxor %xmm9, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm9 +; SSSE3-NEXT: pand %xmm1, %xmm9 ; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 -; SSSE3-NEXT: pand %xmm1, %xmm10 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: paddd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 +; SSSE3-NEXT: pand %xmm5, %xmm10 +; SSSE3-NEXT: paddd %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm5, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm10, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; SSSE3-NEXT: psubd %xmm4, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; SSSE3-NEXT: psubd %xmm10, %xmm11 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 
-; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm1 ; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 -; SSSE3-NEXT: pand %xmm6, %xmm5 -; SSSE3-NEXT: paddd %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9 +; SSSE3-NEXT: pand %xmm6, %xmm9 +; SSSE3-NEXT: paddd %xmm5, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm10, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSSE3-NEXT: psubd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; SSSE3-NEXT: psubd %xmm9, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) ; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm9, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pand %xmm3, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSSE3-NEXT: pand %xmm3, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 ; SSSE3-NEXT: pand %xmm7, %xmm8 -; SSSE3-NEXT: paddd %xmm4, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: paddd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm7, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSSE3-NEXT: psubd %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSSE3-NEXT: psubd %xmm8, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSSE3-NEXT: movdqa %xmm3, 48(%rdi) ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 -; SSSE3-NEXT: pxor %xmm9, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: smulo_v16i32: @@ -1078,44 +1078,44 @@ ; SSE41-NEXT: movdqa %xmm0, (%rdi) ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE41-NEXT: pxor %xmm8, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] -; SSE41-NEXT: pmuldq %xmm9, 
%xmm10 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pmuldq %xmm5, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3],xmm4[4,5],xmm10[6,7] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm8, %xmm9 +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: pmuldq %xmm5, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7] ; SSE41-NEXT: pmulld %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, 16(%rdi) ; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm8, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE41-NEXT: pmuldq %xmm4, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pmuldq %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm5, %xmm8 +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: pmuldq %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5],xmm8[6,7] ; SSE41-NEXT: pmulld %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, 32(%rdi) ; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm2 -; SSE41-NEXT: pxor %xmm8, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE41-NEXT: pmuldq %xmm4, %xmm5 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pmuldq %xmm7, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm5, %xmm6 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: pmuldq %xmm7, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] ; SSE41-NEXT: pmulld %xmm7, %xmm3 ; SSE41-NEXT: movdqa %xmm3, 48(%rdi) ; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm8, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: smulo_v16i32: @@ -1128,45 +1128,45 @@ ; AVX1-NEXT: vpmuldq %xmm4, %xmm6, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] -; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm8 -; AVX1-NEXT: vpsrad $31, %xmm8, %xmm6 +; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm4, %xmm6 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; 
AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3],xmm7[4,5],xmm4[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmuldq %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7] ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpsrad $31, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm9, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpmuldq %xmm4, %xmm7, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] -; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpsrad $31, %xmm4, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; AVX1-NEXT: vpmuldq %xmm7, %xmm9, %xmm7 +; AVX1-NEXT: vpmuldq %xmm6, %xmm8, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3],xmm9[4,5],xmm7[6,7] +; AVX1-NEXT: vpmulld %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpsrad $31, %xmm6, %xmm8 +; AVX1-NEXT: vpcmpeqd %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuldq %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5],xmm8[6,7] ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpxor %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vpackssdw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm8, %xmm0 +; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] @@ -1177,9 +1177,9 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) ; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) -; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; @@ -1477,44 +1477,44 @@ ; SSE2-NEXT: psrlw $8, %xmm5 ; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSE2-NEXT: pxor %xmm10, 
%xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSE2-NEXT: pmulhw %xmm7, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm7 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: pmulhw %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: psrlw $8, %xmm7 ; SSE2-NEXT: packuswb %xmm5, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: pand %xmm9, %xmm10 -; SSE2-NEXT: packuswb %xmm6, %xmm10 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm10, %xmm3 -; SSE2-NEXT: pcmpeqb %xmm7, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm3, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm7, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm6, %xmm1 ; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: pmulhw %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; SSE2-NEXT: pmulhw %xmm7, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm7 ; SSE2-NEXT: psrlw $8, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: pmulhw %xmm5, %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: pmulhw %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: packuswb %xmm6, %xmm11 -; SSE2-NEXT: pcmpgtb %xmm11, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm8 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: packuswb %xmm8, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm4 @@ -1527,30 +1527,30 @@ ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm7 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm7 ; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm9 +; SSE2-NEXT: psrad $31, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE2-NEXT: psrad $24, %xmm2 -; SSE2-NEXT: movdqa %xmm10, 16(%rsi) -; SSE2-NEXT: movdqa %xmm11, (%rsi) -; SSE2-NEXT: movdqa %xmm2, 64(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm8 +; SSE2-NEXT: movdqa %xmm3, 16(%rsi) +; SSE2-NEXT: movdqa %xmm2, (%rsi) +; SSE2-NEXT: movdqa %xmm8, 64(%rdi) ; SSE2-NEXT: movdqa %xmm5, (%rdi) -; SSE2-NEXT: movdqa %xmm3, 112(%rdi) -; SSE2-NEXT: movdqa %xmm1, 96(%rdi) +; SSE2-NEXT: movdqa %xmm1, 112(%rdi) +; SSE2-NEXT: movdqa %xmm9, 96(%rdi) ; SSE2-NEXT: movdqa %xmm7, 80(%rdi) ; SSE2-NEXT: movdqa %xmm0, 48(%rdi) ; SSE2-NEXT: movdqa %xmm6, 32(%rdi) @@ -1570,44 +1570,44 @@ ; SSSE3-NEXT: psrlw $8, %xmm5 ; SSSE3-NEXT: pxor %xmm7, %xmm7 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSSE3-NEXT: pmulhw %xmm7, %xmm10 -; SSSE3-NEXT: movdqa %xmm10, %xmm7 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSSE3-NEXT: pmulhw %xmm7, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm7 ; SSSE3-NEXT: psrlw $8, %xmm7 ; SSSE3-NEXT: packuswb %xmm5, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; SSSE3-NEXT: pand %xmm9, %xmm6 -; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: packuswb %xmm6, %xmm10 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtb %xmm10, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm7, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSSE3-NEXT: pxor %xmm8, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSSE3-NEXT: pand %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: packuswb %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm7, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 +; SSSE3-NEXT: pxor %xmm6, %xmm1 ; SSSE3-NEXT: pxor %xmm7, %xmm7 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSSE3-NEXT: pmulhw %xmm7, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; SSSE3-NEXT: pmulhw %xmm7, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm7 ; SSSE3-NEXT: psrlw $8, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSSE3-NEXT: pxor %xmm11, %xmm11 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSSE3-NEXT: pmulhw %xmm5, %xmm11 -; SSSE3-NEXT: movdqa %xmm11, %xmm0 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: pmulhw %xmm9, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 ; SSSE3-NEXT: packuswb %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm9, %xmm6 -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: packuswb %xmm6, %xmm11 -; SSSE3-NEXT: pcmpgtb %xmm11, %xmm4 +; SSSE3-NEXT: pand %xmm5, %xmm8 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: packuswb %xmm8, %xmm2 +; SSSE3-NEXT: pcmpgtb %xmm2, %xmm4 ; SSSE3-NEXT: pcmpeqb %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: pxor %xmm6, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm4 @@ -1620,30 +1620,30 @@ ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm7 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm7 ; SSSE3-NEXT: psrad $31, %xmm7 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm9 +; SSSE3-NEXT: psrad $31, %xmm9 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: psrad $24, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSSE3-NEXT: psrad $24, %xmm2 -; SSSE3-NEXT: movdqa %xmm10, 16(%rsi) -; SSSE3-NEXT: movdqa %xmm11, (%rsi) -; SSSE3-NEXT: movdqa %xmm2, 64(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm8 +; SSSE3-NEXT: movdqa %xmm3, 16(%rsi) +; SSSE3-NEXT: movdqa %xmm2, (%rsi) +; SSSE3-NEXT: movdqa %xmm8, 64(%rdi) ; SSSE3-NEXT: movdqa %xmm5, (%rdi) -; SSSE3-NEXT: movdqa %xmm3, 112(%rdi) -; SSSE3-NEXT: movdqa %xmm1, 96(%rdi) +; SSSE3-NEXT: movdqa %xmm1, 112(%rdi) +; SSSE3-NEXT: movdqa %xmm9, 96(%rdi) ; SSSE3-NEXT: movdqa %xmm7, 80(%rdi) ; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) ; SSSE3-NEXT: movdqa %xmm6, 32(%rdi) @@ -1653,7 +1653,7 @@ ; SSE41-LABEL: smulo_v32i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: pxor %xmm10, %xmm10 +; SSE41-NEXT: pxor %xmm4, %xmm4 ; SSE41-NEXT: pxor %xmm5, %xmm5 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; SSE41-NEXT: pxor %xmm6, %xmm6 @@ -1669,45 +1669,45 @@ ; SSE41-NEXT: movdqa %xmm3, %xmm7 ; SSE41-NEXT: psrlw $8, %xmm7 ; SSE41-NEXT: packuswb %xmm5, %xmm7 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm9, %xmm6 -; SSE41-NEXT: pand %xmm9, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pand %xmm5, %xmm3 ; SSE41-NEXT: packuswb %xmm6, %xmm3 
; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pcmpgtb %xmm3, %xmm1 ; SSE41-NEXT: pcmpeqb %xmm7, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE41-NEXT: pxor %xmm8, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE41-NEXT: pxor %xmm6, %xmm1 ; SSE41-NEXT: pxor %xmm7, %xmm7 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE41-NEXT: pmulhw %xmm7, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm8, %xmm8 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; SSE41-NEXT: pmulhw %xmm7, %xmm8 +; SSE41-NEXT: movdqa %xmm8, %xmm7 ; SSE41-NEXT: psrlw $8, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE41-NEXT: pmulhw %xmm5, %xmm2 +; SSE41-NEXT: pmulhw %xmm9, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: packuswb %xmm7, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm6 -; SSE41-NEXT: pand %xmm9, %xmm2 -; SSE41-NEXT: packuswb %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtb %xmm2, %xmm10 -; SSE41-NEXT: pcmpeqb %xmm0, %xmm10 -; SSE41-NEXT: pxor %xmm8, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm8 -; SSE41-NEXT: psrad $31, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE41-NEXT: pand %xmm5, %xmm8 +; SSE41-NEXT: pand %xmm5, %xmm2 +; SSE41-NEXT: packuswb %xmm8, %xmm2 +; SSE41-NEXT: pcmpgtb %xmm2, %xmm4 +; SSE41-NEXT: pcmpeqb %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm6, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm5 ; SSE41-NEXT: psrad $31, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm6 ; SSE41-NEXT: psrad $31, %xmm6 @@ -1715,26 +1715,26 @@ ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm7 ; SSE41-NEXT: psrad $31, %xmm7 
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: pmovsxbd %xmm10, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm8 +; SSE41-NEXT: psrad $31, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm9 +; SSE41-NEXT: psrad $31, %xmm9 +; SSE41-NEXT: pmovsxbd %xmm4, %xmm4 ; SSE41-NEXT: pmovsxbd %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm3, 16(%rsi) ; SSE41-NEXT: movdqa %xmm2, (%rsi) ; SSE41-NEXT: movdqa %xmm1, 64(%rdi) -; SSE41-NEXT: movdqa %xmm9, (%rdi) -; SSE41-NEXT: movdqa %xmm4, 112(%rdi) -; SSE41-NEXT: movdqa %xmm0, 96(%rdi) +; SSE41-NEXT: movdqa %xmm4, (%rdi) +; SSE41-NEXT: movdqa %xmm9, 112(%rdi) +; SSE41-NEXT: movdqa %xmm8, 96(%rdi) ; SSE41-NEXT: movdqa %xmm7, 80(%rdi) ; SSE41-NEXT: movdqa %xmm6, 48(%rdi) ; SSE41-NEXT: movdqa %xmm5, 32(%rdi) -; SSE41-NEXT: movdqa %xmm8, 16(%rdi) +; SSE41-NEXT: movdqa %xmm0, 16(%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: smulo_v32i8: @@ -1757,43 +1757,43 @@ ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpxor %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-NEXT: vpmulhw %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm7 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX1-NEXT: vpmulhw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm8 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX1-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpackuswb %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm5 -; AVX1-NEXT: vpcmpgtb %xmm5, %xmm2, %xmm0 +; AVX1-NEXT: vpackuswb %xmm7, %xmm0, %xmm6 +; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor 
%xmm0, %xmm8, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm1 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi) -; AVX1-NEXT: vmovdqa %xmm5, (%rdi) +; AVX1-NEXT: vmovdqa %xmm6, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: smulo_v32i8: @@ -1891,94 +1891,94 @@ ; SSE2-NEXT: psrlw $8, %xmm8 ; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; SSE2-NEXT: pmulhw %xmm10, %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: psrlw $8, %xmm7 -; SSE2-NEXT: packuswb %xmm8, %xmm7 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE2-NEXT: pmulhw %xmm10, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: psrlw $8, %xmm10 +; SSE2-NEXT: packuswb %xmm8, %xmm10 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm8, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm11 -; SSE2-NEXT: packuswb %xmm9, %xmm11 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpgtb %xmm11, %xmm9 -; SSE2-NEXT: pcmpeqb %xmm7, %xmm9 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: packuswb %xmm9, %xmm7 ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE2-NEXT: pmulhw %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: pcmpgtb %xmm7, %xmm3 +; SSE2-NEXT: pcmpeqb %xmm10, %xmm3 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; SSE2-NEXT: pmulhw %xmm9, %xmm10 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm10, %xmm2 ; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: pmulhw %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: psrlw $8, %xmm7 -; SSE2-NEXT: packuswb %xmm2, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pmulhw %xmm9, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: psrlw $8, %xmm9 +; SSE2-NEXT: packuswb %xmm2, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm10 ; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: packuswb %xmm3, %xmm6 +; SSE2-NEXT: packuswb %xmm10, %xmm6 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm2 -; SSE2-NEXT: pcmpeqb %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] -; SSE2-NEXT: pmulhw %xmm3, %xmm7 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: pcmpeqb %xmm9, %xmm2 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; SSE2-NEXT: pmulhw %xmm9, %xmm10 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmulhw %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm3 -; SSE2-NEXT: packuswb %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pmulhw %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: psrlw $8, %xmm9 +; SSE2-NEXT: packuswb %xmm1, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm10 ; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: packuswb %xmm7, %xmm5 +; SSE2-NEXT: packuswb %xmm10, %xmm5 ; SSE2-NEXT: pxor %xmm1, 
%xmm1 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: pmulhw %xmm3, %xmm7 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: pcmpeqb %xmm9, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; SSE2-NEXT: pmulhw %xmm9, %xmm10 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm10, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pmulhw %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm3 -; SSE2-NEXT: packuswb %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pmulhw %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: psrlw $8, %xmm9 +; SSE2-NEXT: packuswb %xmm0, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm10 ; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packuswb %xmm7, %xmm4 +; SSE2-NEXT: packuswb %xmm10, %xmm4 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpgtb %xmm4, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm3, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm11, 48(%rsi) +; SSE2-NEXT: pcmpeqb %xmm9, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm7, 48(%rsi) ; SSE2-NEXT: movdqa %xmm6, 32(%rsi) ; SSE2-NEXT: movdqa %xmm5, 16(%rsi) -; SSE2-NEXT: movdqa %xmm9, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: movdqa %xmm4, (%rsi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm4 ; SSE2-NEXT: movdqa %xmm4, 192(%rdi) @@ -1994,31 +1994,31 @@ ; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm4 ; SSE2-NEXT: movdqa %xmm4, (%rdi) -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 224(%rdi) +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm5 +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: movdqa %xmm5, 224(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: movdqa %xmm4, 240(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm9 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: movdqa %xmm9, 208(%rdi) -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 160(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm3, 208(%rdi) +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 176(%rdi) +; SSE2-NEXT: movdqa %xmm4, 160(%rdi) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: movdqa %xmm3, 176(%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -2070,94 +2070,94 @@ ; SSSE3-NEXT: psrlw $8, %xmm8 ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSSE3-NEXT: pxor %xmm11, %xmm11 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; SSSE3-NEXT: pmulhw %xmm10, %xmm11 -; SSSE3-NEXT: movdqa %xmm11, %xmm7 -; SSSE3-NEXT: psrlw $8, %xmm7 -; SSSE3-NEXT: packuswb %xmm8, %xmm7 +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSSE3-NEXT: pmulhw %xmm10, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: psrlw $8, %xmm10 +; SSSE3-NEXT: packuswb %xmm8, %xmm10 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSSE3-NEXT: pand %xmm8, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm11 -; SSSE3-NEXT: packuswb %xmm9, %xmm11 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpgtb %xmm11, %xmm9 -; SSSE3-NEXT: pcmpeqb %xmm7, %xmm9 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = 
xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: packuswb %xmm9, %xmm7 ; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSSE3-NEXT: pmulhw %xmm7, %xmm3 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSSE3-NEXT: pcmpgtb %xmm7, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm3 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; SSSE3-NEXT: pxor %xmm10, %xmm10 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; SSSE3-NEXT: pmulhw %xmm9, %xmm10 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] ; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm10, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: pmulhw %xmm7, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: psrlw $8, %xmm7 -; SSSE3-NEXT: packuswb %xmm2, %xmm7 -; SSSE3-NEXT: pand %xmm8, %xmm3 +; SSSE3-NEXT: pmulhw %xmm9, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: psrlw $8, %xmm9 +; SSSE3-NEXT: packuswb %xmm2, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm10 ; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: packuswb %xmm3, %xmm6 +; SSSE3-NEXT: packuswb %xmm10, %xmm6 ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pcmpgtb %xmm6, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm7, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] -; SSSE3-NEXT: pmulhw %xmm3, %xmm7 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSSE3-NEXT: pcmpeqb %xmm9, %xmm2 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSSE3-NEXT: pxor %xmm10, %xmm10 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; SSSE3-NEXT: pmulhw %xmm9, %xmm10 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = 
xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; SSSE3-NEXT: pxor %xmm5, %xmm5 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSSE3-NEXT: movdqa %xmm7, %xmm1 +; SSSE3-NEXT: movdqa %xmm10, %xmm1 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: pmulhw %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 -; SSSE3-NEXT: psrlw $8, %xmm3 -; SSSE3-NEXT: packuswb %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pmulhw %xmm9, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: psrlw $8, %xmm9 +; SSSE3-NEXT: packuswb %xmm1, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm10 ; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: packuswb %xmm7, %xmm5 +; SSSE3-NEXT: packuswb %xmm10, %xmm5 ; SSSE3-NEXT: pxor %xmm1, %xmm1 ; SSSE3-NEXT: pcmpgtb %xmm5, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSSE3-NEXT: pmulhw %xmm3, %xmm7 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: pcmpeqb %xmm9, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSSE3-NEXT: pxor %xmm10, %xmm10 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; SSSE3-NEXT: pmulhw %xmm9, %xmm10 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] ; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSSE3-NEXT: movdqa %xmm7, %xmm0 +; SSSE3-NEXT: movdqa %xmm10, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pmulhw %xmm3, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm3 -; SSSE3-NEXT: psrlw $8, %xmm3 -; SSSE3-NEXT: packuswb %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pmulhw %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm9 +; SSSE3-NEXT: psrlw $8, %xmm9 +; SSSE3-NEXT: packuswb %xmm0, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm10 ; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: packuswb %xmm7, %xmm4 +; SSSE3-NEXT: packuswb %xmm10, %xmm4 ; SSSE3-NEXT: pxor %xmm0, %xmm0 ; SSSE3-NEXT: pcmpgtb %xmm4, %xmm0 -; SSSE3-NEXT: pcmpeqb %xmm3, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSSE3-NEXT: pxor %xmm3, %xmm9 -; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm11, 48(%rsi) +; SSSE3-NEXT: pcmpeqb %xmm9, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 +; SSSE3-NEXT: pxor %xmm8, %xmm3 +; SSSE3-NEXT: pxor %xmm8, 
%xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm7, 48(%rsi) ; SSSE3-NEXT: movdqa %xmm6, 32(%rsi) ; SSSE3-NEXT: movdqa %xmm5, 16(%rsi) -; SSSE3-NEXT: movdqa %xmm9, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: movdqa %xmm4, (%rsi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: psrad $24, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, 192(%rdi) @@ -2173,31 +2173,31 @@ ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: psrad $24, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, (%rdi) -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 224(%rdi) +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, 224(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, 240(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm9 -; SSSE3-NEXT: psrad $31, %xmm9 -; SSSE3-NEXT: movdqa %xmm9, 208(%rdi) -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 160(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm3, 208(%rdi) +; SSSE3-NEXT: movdqa %xmm4, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 176(%rdi) +; SSSE3-NEXT: movdqa %xmm4, 160(%rdi) +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, 176(%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -2249,110 +2249,110 @@ ; SSE41-NEXT: psrlw $8, %xmm8 ; SSE41-NEXT: pxor %xmm10, %xmm10 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE41-NEXT: pxor %xmm11, %xmm11 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm11 = 
xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; SSE41-NEXT: pmulhw %xmm10, %xmm11 -; SSE41-NEXT: movdqa %xmm11, %xmm7 -; SSE41-NEXT: psrlw $8, %xmm7 -; SSE41-NEXT: packuswb %xmm8, %xmm7 +; SSE41-NEXT: pxor %xmm7, %xmm7 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE41-NEXT: pmulhw %xmm10, %xmm7 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: psrlw $8, %xmm10 +; SSE41-NEXT: packuswb %xmm8, %xmm10 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm8, %xmm9 -; SSE41-NEXT: pand %xmm8, %xmm11 -; SSE41-NEXT: packuswb %xmm9, %xmm11 -; SSE41-NEXT: pxor %xmm9, %xmm9 -; SSE41-NEXT: pcmpgtb %xmm11, %xmm9 -; SSE41-NEXT: pcmpeqb %xmm7, %xmm9 -; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: packuswb %xmm9, %xmm7 ; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE41-NEXT: pmulhw %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE41-NEXT: pcmpgtb %xmm7, %xmm3 +; SSE41-NEXT: pcmpeqb %xmm10, %xmm3 +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; SSE41-NEXT: pxor %xmm10, %xmm10 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; SSE41-NEXT: pmulhw %xmm9, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] ; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSE41-NEXT: movdqa %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm10, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pmulhw %xmm7, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: psrlw $8, %xmm7 -; SSE41-NEXT: packuswb %xmm2, %xmm7 -; SSE41-NEXT: pand %xmm8, %xmm3 +; SSE41-NEXT: pmulhw %xmm9, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: psrlw $8, %xmm9 +; SSE41-NEXT: packuswb %xmm2, %xmm9 +; SSE41-NEXT: pand %xmm8, %xmm10 ; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: packuswb %xmm3, %xmm6 +; SSE41-NEXT: packuswb %xmm10, %xmm6 ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pcmpgtb %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm7, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = 
xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] -; SSE41-NEXT: pmulhw %xmm3, %xmm7 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE41-NEXT: pcmpeqb %xmm9, %xmm2 +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE41-NEXT: pxor %xmm10, %xmm10 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; SSE41-NEXT: pmulhw %xmm9, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; SSE41-NEXT: pxor %xmm5, %xmm5 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE41-NEXT: movdqa %xmm7, %xmm1 +; SSE41-NEXT: movdqa %xmm10, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmulhw %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: packuswb %xmm1, %xmm3 -; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: pmulhw %xmm9, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: psrlw $8, %xmm9 +; SSE41-NEXT: packuswb %xmm1, %xmm9 +; SSE41-NEXT: pand %xmm8, %xmm10 ; SSE41-NEXT: pand %xmm8, %xmm5 -; SSE41-NEXT: packuswb %xmm7, %xmm5 +; SSE41-NEXT: packuswb %xmm10, %xmm5 ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pcmpgtb %xmm5, %xmm1 -; SSE41-NEXT: pcmpeqb %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE41-NEXT: pmulhw %xmm3, %xmm7 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE41-NEXT: pcmpeqb %xmm9, %xmm1 +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE41-NEXT: pxor %xmm10, %xmm10 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; SSE41-NEXT: pmulhw %xmm9, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] ; SSE41-NEXT: pxor %xmm4, %xmm4 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE41-NEXT: movdqa %xmm7, 
%xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pmulhw %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: packuswb %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: pmulhw %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: psrlw $8, %xmm9 +; SSE41-NEXT: packuswb %xmm0, %xmm9 +; SSE41-NEXT: pand %xmm8, %xmm10 ; SSE41-NEXT: pand %xmm8, %xmm4 -; SSE41-NEXT: packuswb %xmm7, %xmm4 +; SSE41-NEXT: packuswb %xmm10, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pcmpgtb %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqb %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE41-NEXT: pxor %xmm3, %xmm9 -; SSE41-NEXT: pxor %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm11, 48(%rsi) +; SSE41-NEXT: pcmpeqb %xmm9, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE41-NEXT: pxor %xmm8, %xmm3 +; SSE41-NEXT: pxor %xmm8, %xmm2 +; SSE41-NEXT: pxor %xmm8, %xmm1 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm7, 48(%rsi) ; SSE41-NEXT: movdqa %xmm6, 32(%rsi) ; SSE41-NEXT: movdqa %xmm5, 16(%rsi) ; SSE41-NEXT: movdqa %xmm4, (%rsi) -; SSE41-NEXT: pmovsxbd %xmm9, %xmm3 -; SSE41-NEXT: movdqa %xmm3, 192(%rdi) -; SSE41-NEXT: pmovsxbd %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, 128(%rdi) -; SSE41-NEXT: pmovsxbd %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm3, 64(%rdi) -; SSE41-NEXT: pmovsxbd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm3, (%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm3 -; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: movdqa %xmm3, 224(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,3,3,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm3 -; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: movdqa %xmm3, 240(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] +; SSE41-NEXT: pmovsxbd %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm4, 192(%rdi) +; SSE41-NEXT: pmovsxbd %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm4, 128(%rdi) +; SSE41-NEXT: pmovsxbd %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm4, 64(%rdi) +; SSE41-NEXT: pmovsxbd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm4, (%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: movdqa %xmm4, 224(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: movdqa %xmm4, 240(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 @@ -2409,75 +2409,75 @@ ; AVX1-NEXT: movq %rdi, %rax ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; 
AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX1-NEXT: vpmulhw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm8 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; AVX1-NEXT: vpmulhw %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm6 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX1-NEXT: vpmulhw %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm4 +; AVX1-NEXT: vpmulhw %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm7 +; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm10, %xmm6 -; AVX1-NEXT: vpand %xmm7, %xmm10, %xmm7 -; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm9 -; AVX1-NEXT: vpcmpgtb %xmm9, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpeqb %xmm4, %xmm6, %xmm8 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; AVX1-NEXT: vpmulhw %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm6 +; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm8 +; AVX1-NEXT: vpcmpeqb %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX1-NEXT: vpmulhw %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm9 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm3 -; AVX1-NEXT: vpackuswb %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm10, %xmm1 -; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm11 -; AVX1-NEXT: vpcmpgtb %xmm11, %xmm5, %xmm4 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm4, %xmm12 +; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb 
%xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm5, %xmm8 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm8, %xmm8 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX1-NEXT: vpmulhw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] +; AVX1-NEXT: vpmulhw %xmm9, %xmm11, %xmm9 +; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm11 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-NEXT: vpmulhw %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm6 -; AVX1-NEXT: vpackuswb %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm10, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm10, %xmm3 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtb %xmm3, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX1-NEXT: vpmulhw %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm6 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; AVX1-NEXT: vpmulhw %xmm3, %xmm10, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm10 +; AVX1-NEXT: vpackuswb %xmm11, %xmm10, %xmm10 +; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm9 +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtb %xmm3, %xmm5, %xmm9 +; AVX1-NEXT: vpcmpeqb %xmm10, %xmm9, %xmm10 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX1-NEXT: vpmulhw %xmm9, %xmm11, %xmm9 +; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm11 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; 
AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 -; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm4 -; AVX1-NEXT: vpand %xmm0, %xmm10, %xmm0 -; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm0 +; AVX1-NEXT: vpackuswb %xmm11, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm9 +; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm9, %xmm0, %xmm9 +; AVX1-NEXT: vpcmpgtb %xmm9, %xmm5, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm6 -; AVX1-NEXT: vpxor %xmm7, %xmm12, %xmm5 -; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm2 -; AVX1-NEXT: vpxor %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm9, 48(%rsi) -; AVX1-NEXT: vmovdqa %xmm11, 32(%rsi) +; AVX1-NEXT: vpcmpeqd %xmm11, %xmm11, %xmm11 +; AVX1-NEXT: vpxor %xmm7, %xmm11, %xmm6 +; AVX1-NEXT: vpxor %xmm11, %xmm8, %xmm5 +; AVX1-NEXT: vpxor %xmm11, %xmm10, %xmm2 +; AVX1-NEXT: vpxor %xmm0, %xmm11, %xmm0 +; AVX1-NEXT: vmovdqa %xmm4, 48(%rsi) +; AVX1-NEXT: vmovdqa %xmm1, 32(%rsi) ; AVX1-NEXT: vmovdqa %xmm3, 16(%rsi) -; AVX1-NEXT: vmovdqa %xmm4, (%rsi) +; AVX1-NEXT: vmovdqa %xmm9, (%rsi) ; AVX1-NEXT: vpmovsxbd %xmm6, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 192(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm5, %xmm1 @@ -2562,29 +2562,29 @@ ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] -; AVX2-NEXT: vpmovsxbd %xmm4, %ymm8 +; AVX2-NEXT: vpmovsxbd %xmm4, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX2-NEXT: vpmovsxbd %xmm6, %ymm9 +; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm9, %ymm9 ; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 ; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5 ; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 -; AVX2-NEXT: vpmovsxbd %xmm4, %ymm4 +; AVX2-NEXT: vpmovsxbd %xmm8, %ymm8 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rsi) ; AVX2-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-NEXT: vmovdqa %ymm4, 192(%rdi) +; AVX2-NEXT: vmovdqa %ymm8, 192(%rdi) ; AVX2-NEXT: vmovdqa %ymm3, 128(%rdi) ; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) -; AVX2-NEXT: vmovdqa %ymm6, 224(%rdi) +; AVX2-NEXT: vmovdqa %ymm9, 224(%rdi) ; AVX2-NEXT: vmovdqa %ymm7, 160(%rdi) -; AVX2-NEXT: vmovdqa %ymm9, 96(%rdi) -; AVX2-NEXT: vmovdqa %ymm8, 32(%rdi) +; AVX2-NEXT: vmovdqa %ymm6, 96(%rdi) +; AVX2-NEXT: vmovdqa %ymm4, 32(%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2796,23 +2796,23 @@ ; SSE2-LABEL: smulo_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %r8 +; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm2, %rcx ; SSE2-NEXT: movq %xmm1, %rdx ; SSE2-NEXT: movq %xmm0, %rsi -; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: xorl %r8d, %r8d ; SSE2-NEXT: imulq %rdx, %rsi -; SSE2-NEXT: movq $-1, %r9 -; SSE2-NEXT: movl $0, %edx -; SSE2-NEXT: cmovoq %r9, %rdx +; SSE2-NEXT: movq $-1, %rdx +; SSE2-NEXT: movl $0, %r9d +; SSE2-NEXT: cmovoq %rdx, %r9 ; SSE2-NEXT: movq %rsi, %xmm1 -; SSE2-NEXT: imulq %r8, %rcx +; SSE2-NEXT: imulq %rax, %rcx ; SSE2-NEXT: movq %rcx, %xmm0 ; SSE2-NEXT: 
punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: cmovoq %r9, %rax -; SSE2-NEXT: movq %rax, %xmm2 +; SSE2-NEXT: movq %r9, %xmm0 +; SSE2-NEXT: cmovoq %rdx, %r8 +; SSE2-NEXT: movq %r8, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: movdqa %xmm1, (%rdi) @@ -2821,23 +2821,23 @@ ; SSSE3-LABEL: smulo_v2i64: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSSE3-NEXT: movq %xmm2, %r8 +; SSSE3-NEXT: movq %xmm2, %rax ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSSE3-NEXT: movq %xmm2, %rcx ; SSSE3-NEXT: movq %xmm1, %rdx ; SSSE3-NEXT: movq %xmm0, %rsi -; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: xorl %r8d, %r8d ; SSSE3-NEXT: imulq %rdx, %rsi -; SSSE3-NEXT: movq $-1, %r9 -; SSSE3-NEXT: movl $0, %edx -; SSSE3-NEXT: cmovoq %r9, %rdx +; SSSE3-NEXT: movq $-1, %rdx +; SSSE3-NEXT: movl $0, %r9d +; SSSE3-NEXT: cmovoq %rdx, %r9 ; SSSE3-NEXT: movq %rsi, %xmm1 -; SSSE3-NEXT: imulq %r8, %rcx +; SSSE3-NEXT: imulq %rax, %rcx ; SSSE3-NEXT: movq %rcx, %xmm0 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSSE3-NEXT: movq %rdx, %xmm0 -; SSSE3-NEXT: cmovoq %r9, %rax -; SSSE3-NEXT: movq %rax, %xmm2 +; SSSE3-NEXT: movq %r9, %xmm0 +; SSSE3-NEXT: cmovoq %rdx, %r8 +; SSSE3-NEXT: movq %r8, %xmm2 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: movdqa %xmm1, (%rdi) @@ -2845,22 +2845,22 @@ ; ; SSE41-LABEL: smulo_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %xmm1, %r8 +; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: movq %xmm0, %rcx ; SSE41-NEXT: pextrq $1, %xmm1, %rdx ; SSE41-NEXT: pextrq $1, %xmm0, %rsi -; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: xorl %r8d, %r8d ; SSE41-NEXT: imulq %rdx, %rsi -; SSE41-NEXT: movq $-1, %r9 -; SSE41-NEXT: movl $0, %edx -; SSE41-NEXT: cmovoq %r9, %rdx +; SSE41-NEXT: movq $-1, %rdx +; SSE41-NEXT: movl $0, %r9d +; SSE41-NEXT: cmovoq %rdx, %r9 ; SSE41-NEXT: movq %rsi, %xmm0 -; SSE41-NEXT: imulq %r8, %rcx +; SSE41-NEXT: imulq %rax, %rcx ; SSE41-NEXT: movq %rcx, %xmm1 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE41-NEXT: movq %rdx, %xmm0 -; SSE41-NEXT: cmovoq %r9, %rax -; SSE41-NEXT: movq %rax, %xmm2 +; SSE41-NEXT: movq %r9, %xmm0 +; SSE41-NEXT: cmovoq %rdx, %r8 +; SSE41-NEXT: movq %r8, %xmm2 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE41-NEXT: movdqa %xmm1, (%rdi) @@ -2868,22 +2868,22 @@ ; ; AVX-LABEL: smulo_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovq %xmm1, %r8 +; AVX-NEXT: vmovq %xmm1, %rax ; AVX-NEXT: vmovq %xmm0, %rcx ; AVX-NEXT: vpextrq $1, %xmm1, %rdx ; AVX-NEXT: vpextrq $1, %xmm0, %rsi -; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: xorl %r8d, %r8d ; AVX-NEXT: imulq %rdx, %rsi -; AVX-NEXT: movq $-1, %r9 -; AVX-NEXT: movl $0, %edx -; AVX-NEXT: cmovoq %r9, %rdx +; AVX-NEXT: movq $-1, %rdx +; AVX-NEXT: movl $0, %r9d +; AVX-NEXT: cmovoq %rdx, %r9 ; AVX-NEXT: vmovq %rsi, %xmm0 -; AVX-NEXT: imulq %r8, %rcx +; AVX-NEXT: imulq %rax, %rcx ; AVX-NEXT: vmovq %rcx, %xmm1 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovq %rdx, %xmm0 -; AVX-NEXT: cmovoq %r9, %rax -; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vmovq %r9, %xmm0 +; AVX-NEXT: cmovoq %rdx, %r8 +; AVX-NEXT: vmovq %r8, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vmovdqa %xmm1, (%rdi) @@ -3297,126 +3297,125 @@ ; SSE2-NEXT: pushq %r12 ; 
SSE2-NEXT: pushq %rbx ; SSE2-NEXT: movq %r8, %r14 -; SSE2-NEXT: movq %rcx, %r11 -; SSE2-NEXT: movq %rdx, %r15 -; SSE2-NEXT: movq %rsi, %r13 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: movq %rsi, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movq %r14, %rsi -; SSE2-NEXT: imulq %rcx, %rsi +; SSE2-NEXT: movq %rdx, %r8 +; SSE2-NEXT: movq %rsi, %r11 +; SSE2-NEXT: movq %rdi, %r10 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SSE2-NEXT: movq %r11, %r12 +; SSE2-NEXT: sarq $63, %r12 +; SSE2-NEXT: movq %r14, %rbx +; SSE2-NEXT: imulq %r12, %rbx ; SSE2-NEXT: movq %r14, %rax -; SSE2-NEXT: mulq %rcx -; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: addq %rsi, %rdx -; SSE2-NEXT: imulq %r9, %rcx -; SSE2-NEXT: addq %rdx, %rcx +; SSE2-NEXT: mulq %r12 +; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: addq %rbx, %rdx +; SSE2-NEXT: imulq %r9, %r12 +; SSE2-NEXT: addq %rdx, %r12 ; SSE2-NEXT: movq %r9, %rbx ; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: movq %rbx, %rsi -; SSE2-NEXT: imulq %r13, %rsi +; SSE2-NEXT: movq %rbx, %r13 +; SSE2-NEXT: imulq %r11, %r13 ; SSE2-NEXT: movq %rbx, %rax -; SSE2-NEXT: mulq %rdi -; SSE2-NEXT: movq %rax, %r12 -; SSE2-NEXT: addq %rsi, %rdx -; SSE2-NEXT: imulq %rdi, %rbx +; SSE2-NEXT: mulq %r10 +; SSE2-NEXT: movq %rax, %r15 +; SSE2-NEXT: addq %r13, %rdx +; SSE2-NEXT: imulq %r10, %rbx ; SSE2-NEXT: addq %rdx, %rbx -; SSE2-NEXT: addq %r10, %r12 -; SSE2-NEXT: adcq %rcx, %rbx -; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: addq %rdi, %r15 +; SSE2-NEXT: adcq %r12, %rbx +; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %r14 -; SSE2-NEXT: movq %rdx, %rbp -; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: movq %r13, %rax +; SSE2-NEXT: movq %rdx, %r12 +; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %r14 -; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: addq %rbp, %rcx -; SSE2-NEXT: adcq $0, %rsi -; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: movq %rdx, %r14 +; SSE2-NEXT: movq %rax, %r13 +; SSE2-NEXT: addq %r12, %r13 +; SSE2-NEXT: adcq $0, %r14 +; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %r9 -; SSE2-NEXT: movq %rdx, %rbp -; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: addq %rcx, %rdi -; SSE2-NEXT: adcq %rsi, %rbp +; SSE2-NEXT: movq %rdx, %r12 +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: addq %r13, %r10 +; SSE2-NEXT: adcq %r14, %r12 ; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %ecx -; SSE2-NEXT: movq %r13, %rax +; SSE2-NEXT: movzbl %al, %r14d +; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %r9 -; SSE2-NEXT: addq %rbp, %rax -; SSE2-NEXT: adcq %rcx, %rdx ; SSE2-NEXT: addq %r12, %rax +; SSE2-NEXT: adcq %r14, %rdx +; SSE2-NEXT: addq %r15, %rax ; SSE2-NEXT: adcq %rbx, %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; SSE2-NEXT: movq %rdi, 8(%r13) -; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: xorq %rdi, %rdx -; SSE2-NEXT: xorq %rax, %rdi -; SSE2-NEXT: xorl %r12d, %r12d -; SSE2-NEXT: orq %rdx, %rdi -; SSE2-NEXT: setne %r12b -; SSE2-NEXT: movq %r11, %rdi -; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: movq %r8, %rax -; SSE2-NEXT: movq %r8, %rsi -; SSE2-NEXT: imulq %rdi, %rsi -; SSE2-NEXT: movq %r8, %rbx -; SSE2-NEXT: mulq %rdi -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: addq %rsi, %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: imulq %r8, %rdi -; SSE2-NEXT: addq %rdx, %rdi -; SSE2-NEXT: movq %r8, %rsi -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: movq %rsi, %rbp -; SSE2-NEXT: imulq %r11, %rbp +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE2-NEXT: movq %r10, 8(%r12) +; SSE2-NEXT: sarq $63, %r10 +; SSE2-NEXT: xorq %r10, %rdx +; 
SSE2-NEXT: xorq %rax, %r10 +; SSE2-NEXT: xorl %r15d, %r15d +; SSE2-NEXT: orq %rdx, %r10 +; SSE2-NEXT: setne %r15b +; SSE2-NEXT: movq %rcx, %rbx +; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: movq %rsi, %r10 +; SSE2-NEXT: imulq %rbx, %r10 ; SSE2-NEXT: movq %rsi, %rax -; SSE2-NEXT: mulq %r15 -; SSE2-NEXT: movq %rax, %r14 -; SSE2-NEXT: addq %rbp, %rdx -; SSE2-NEXT: imulq %r15, %rsi -; SSE2-NEXT: addq %rdx, %rsi -; SSE2-NEXT: addq %rcx, %r14 -; SSE2-NEXT: adcq %rdi, %rsi -; SSE2-NEXT: movq %r15, %rax ; SSE2-NEXT: mulq %rbx -; SSE2-NEXT: movq %rdx, %rcx ; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: movq %r11, %rax -; SSE2-NEXT: mulq %rbx -; SSE2-NEXT: movq %rdx, %rbx -; SSE2-NEXT: movq %rax, %rbp -; SSE2-NEXT: addq %rcx, %rbp -; SSE2-NEXT: adcq $0, %rbx -; SSE2-NEXT: movq %r15, %rax +; SSE2-NEXT: addq %r10, %rdx +; SSE2-NEXT: imulq %rbp, %rbx +; SSE2-NEXT: addq %rdx, %rbx +; SSE2-NEXT: movq %rbp, %r10 +; SSE2-NEXT: sarq $63, %r10 +; SSE2-NEXT: movq %r10, %r14 +; SSE2-NEXT: imulq %rcx, %r14 +; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %r8 -; SSE2-NEXT: movq %rdx, %rcx -; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: addq %rbp, %rdi -; SSE2-NEXT: adcq %rbx, %rcx +; SSE2-NEXT: movq %rax, %r11 +; SSE2-NEXT: addq %r14, %rdx +; SSE2-NEXT: imulq %r8, %r10 +; SSE2-NEXT: addq %rdx, %r10 +; SSE2-NEXT: addq %r9, %r11 +; SSE2-NEXT: adcq %rbx, %r10 +; SSE2-NEXT: movq %r8, %rax +; SSE2-NEXT: mulq %rsi +; SSE2-NEXT: movq %rdx, %r9 +; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: mulq %rsi +; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: movq %rax, %r14 +; SSE2-NEXT: addq %r9, %r14 +; SSE2-NEXT: adcq $0, %rsi +; SSE2-NEXT: movq %r8, %rax +; SSE2-NEXT: mulq %rbp +; SSE2-NEXT: movq %rdx, %r8 +; SSE2-NEXT: movq %rax, %r9 +; SSE2-NEXT: addq %r14, %r9 +; SSE2-NEXT: adcq %rsi, %r8 ; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %ebp -; SSE2-NEXT: movq %r11, %rax -; SSE2-NEXT: mulq %r8 -; SSE2-NEXT: addq %rcx, %rax -; SSE2-NEXT: adcq %rbp, %rdx -; SSE2-NEXT: addq %r14, %rax +; SSE2-NEXT: movzbl %al, %esi +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: mulq %rbp +; SSE2-NEXT: addq %r8, %rax ; SSE2-NEXT: adcq %rsi, %rdx -; SSE2-NEXT: movq %rdi, 24(%r13) -; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: xorq %rdi, %rdx -; SSE2-NEXT: xorq %rax, %rdi +; SSE2-NEXT: addq %r11, %rax +; SSE2-NEXT: adcq %r10, %rdx +; SSE2-NEXT: movq %r9, 24(%r12) +; SSE2-NEXT: sarq $63, %r9 +; SSE2-NEXT: xorq %r9, %rdx +; SSE2-NEXT: xorq %rax, %r9 ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdx, %rdi +; SSE2-NEXT: orq %rdx, %r9 ; SSE2-NEXT: setne %al ; SSE2-NEXT: negl %eax ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: negl %r12d -; SSE2-NEXT: movd %r12d, %xmm0 +; SSE2-NEXT: negl %r15d +; SSE2-NEXT: movd %r15d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %r9, 16(%r13) -; SSE2-NEXT: movq %r10, (%r13) +; SSE2-NEXT: movq %rbx, 16(%r12) +; SSE2-NEXT: movq %rdi, (%r12) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -3434,126 +3433,125 @@ ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx ; SSSE3-NEXT: movq %r8, %r14 -; SSSE3-NEXT: movq %rcx, %r11 -; SSSE3-NEXT: movq %rdx, %r15 -; SSSE3-NEXT: movq %rsi, %r13 -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSSE3-NEXT: movq %rsi, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movq %r14, %rsi -; SSSE3-NEXT: imulq %rcx, %rsi +; SSSE3-NEXT: movq %rdx, %r8 +; SSSE3-NEXT: movq %rsi, %r11 +; SSSE3-NEXT: movq %rdi, %r10 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SSSE3-NEXT: movq %r11, 
%r12 +; SSSE3-NEXT: sarq $63, %r12 +; SSSE3-NEXT: movq %r14, %rbx +; SSSE3-NEXT: imulq %r12, %rbx ; SSSE3-NEXT: movq %r14, %rax -; SSSE3-NEXT: mulq %rcx -; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: addq %rsi, %rdx -; SSSE3-NEXT: imulq %r9, %rcx -; SSSE3-NEXT: addq %rdx, %rcx +; SSSE3-NEXT: mulq %r12 +; SSSE3-NEXT: movq %rax, %rdi +; SSSE3-NEXT: addq %rbx, %rdx +; SSSE3-NEXT: imulq %r9, %r12 +; SSSE3-NEXT: addq %rdx, %r12 ; SSSE3-NEXT: movq %r9, %rbx ; SSSE3-NEXT: sarq $63, %rbx -; SSSE3-NEXT: movq %rbx, %rsi -; SSSE3-NEXT: imulq %r13, %rsi +; SSSE3-NEXT: movq %rbx, %r13 +; SSSE3-NEXT: imulq %r11, %r13 ; SSSE3-NEXT: movq %rbx, %rax -; SSSE3-NEXT: mulq %rdi -; SSSE3-NEXT: movq %rax, %r12 -; SSSE3-NEXT: addq %rsi, %rdx -; SSSE3-NEXT: imulq %rdi, %rbx +; SSSE3-NEXT: mulq %r10 +; SSSE3-NEXT: movq %rax, %r15 +; SSSE3-NEXT: addq %r13, %rdx +; SSSE3-NEXT: imulq %r10, %rbx ; SSSE3-NEXT: addq %rdx, %rbx -; SSSE3-NEXT: addq %r10, %r12 -; SSSE3-NEXT: adcq %rcx, %rbx -; SSSE3-NEXT: movq %rdi, %rax +; SSSE3-NEXT: addq %rdi, %r15 +; SSSE3-NEXT: adcq %r12, %rbx +; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %r14 -; SSSE3-NEXT: movq %rdx, %rbp -; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: movq %r13, %rax +; SSSE3-NEXT: movq %rdx, %r12 +; SSSE3-NEXT: movq %rax, %rdi +; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %r14 -; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: addq %rbp, %rcx -; SSSE3-NEXT: adcq $0, %rsi -; SSSE3-NEXT: movq %rdi, %rax +; SSSE3-NEXT: movq %rdx, %r14 +; SSSE3-NEXT: movq %rax, %r13 +; SSSE3-NEXT: addq %r12, %r13 +; SSSE3-NEXT: adcq $0, %r14 +; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %r9 -; SSSE3-NEXT: movq %rdx, %rbp -; SSSE3-NEXT: movq %rax, %rdi -; SSSE3-NEXT: addq %rcx, %rdi -; SSSE3-NEXT: adcq %rsi, %rbp +; SSSE3-NEXT: movq %rdx, %r12 +; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: addq %r13, %r10 +; SSSE3-NEXT: adcq %r14, %r12 ; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %ecx -; SSSE3-NEXT: movq %r13, %rax +; SSSE3-NEXT: movzbl %al, %r14d +; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %r9 -; SSSE3-NEXT: addq %rbp, %rax -; SSSE3-NEXT: adcq %rcx, %rdx ; SSSE3-NEXT: addq %r12, %rax +; SSSE3-NEXT: adcq %r14, %rdx +; SSSE3-NEXT: addq %r15, %rax ; SSSE3-NEXT: adcq %rbx, %rdx -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; SSSE3-NEXT: movq %rdi, 8(%r13) -; SSSE3-NEXT: sarq $63, %rdi -; SSSE3-NEXT: xorq %rdi, %rdx -; SSSE3-NEXT: xorq %rax, %rdi -; SSSE3-NEXT: xorl %r12d, %r12d -; SSSE3-NEXT: orq %rdx, %rdi -; SSSE3-NEXT: setne %r12b -; SSSE3-NEXT: movq %r11, %rdi -; SSSE3-NEXT: sarq $63, %rdi -; SSSE3-NEXT: movq %r8, %rax -; SSSE3-NEXT: movq %r8, %rsi -; SSSE3-NEXT: imulq %rdi, %rsi -; SSSE3-NEXT: movq %r8, %rbx -; SSSE3-NEXT: mulq %rdi -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: addq %rsi, %rdx -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSSE3-NEXT: imulq %r8, %rdi -; SSSE3-NEXT: addq %rdx, %rdi -; SSSE3-NEXT: movq %r8, %rsi -; SSSE3-NEXT: sarq $63, %rsi -; SSSE3-NEXT: movq %rsi, %rbp -; SSSE3-NEXT: imulq %r11, %rbp +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSSE3-NEXT: movq %r10, 8(%r12) +; SSSE3-NEXT: sarq $63, %r10 +; SSSE3-NEXT: xorq %r10, %rdx +; SSSE3-NEXT: xorq %rax, %r10 +; SSSE3-NEXT: xorl %r15d, %r15d +; SSSE3-NEXT: orq %rdx, %r10 +; SSSE3-NEXT: setne %r15b +; SSSE3-NEXT: movq %rcx, %rbx +; SSSE3-NEXT: sarq $63, %rbx +; SSSE3-NEXT: movq %rsi, %r10 +; SSSE3-NEXT: imulq %rbx, %r10 ; SSSE3-NEXT: movq %rsi, %rax -; SSSE3-NEXT: mulq %r15 -; SSSE3-NEXT: movq %rax, %r14 -; SSSE3-NEXT: addq %rbp, %rdx -; SSSE3-NEXT: imulq %r15, %rsi -; 
SSSE3-NEXT: addq %rdx, %rsi -; SSSE3-NEXT: addq %rcx, %r14 -; SSSE3-NEXT: adcq %rdi, %rsi -; SSSE3-NEXT: movq %r15, %rax ; SSSE3-NEXT: mulq %rbx -; SSSE3-NEXT: movq %rdx, %rcx ; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: movq %r11, %rax -; SSSE3-NEXT: mulq %rbx -; SSSE3-NEXT: movq %rdx, %rbx -; SSSE3-NEXT: movq %rax, %rbp -; SSSE3-NEXT: addq %rcx, %rbp -; SSSE3-NEXT: adcq $0, %rbx -; SSSE3-NEXT: movq %r15, %rax +; SSSE3-NEXT: addq %r10, %rdx +; SSSE3-NEXT: imulq %rbp, %rbx +; SSSE3-NEXT: addq %rdx, %rbx +; SSSE3-NEXT: movq %rbp, %r10 +; SSSE3-NEXT: sarq $63, %r10 +; SSSE3-NEXT: movq %r10, %r14 +; SSSE3-NEXT: imulq %rcx, %r14 +; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %r8 -; SSSE3-NEXT: movq %rdx, %rcx -; SSSE3-NEXT: movq %rax, %rdi -; SSSE3-NEXT: addq %rbp, %rdi -; SSSE3-NEXT: adcq %rbx, %rcx +; SSSE3-NEXT: movq %rax, %r11 +; SSSE3-NEXT: addq %r14, %rdx +; SSSE3-NEXT: imulq %r8, %r10 +; SSSE3-NEXT: addq %rdx, %r10 +; SSSE3-NEXT: addq %r9, %r11 +; SSSE3-NEXT: adcq %rbx, %r10 +; SSSE3-NEXT: movq %r8, %rax +; SSSE3-NEXT: mulq %rsi +; SSSE3-NEXT: movq %rdx, %r9 +; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: movq %rcx, %rax +; SSSE3-NEXT: mulq %rsi +; SSSE3-NEXT: movq %rdx, %rsi +; SSSE3-NEXT: movq %rax, %r14 +; SSSE3-NEXT: addq %r9, %r14 +; SSSE3-NEXT: adcq $0, %rsi +; SSSE3-NEXT: movq %r8, %rax +; SSSE3-NEXT: mulq %rbp +; SSSE3-NEXT: movq %rdx, %r8 +; SSSE3-NEXT: movq %rax, %r9 +; SSSE3-NEXT: addq %r14, %r9 +; SSSE3-NEXT: adcq %rsi, %r8 ; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %ebp -; SSSE3-NEXT: movq %r11, %rax -; SSSE3-NEXT: mulq %r8 -; SSSE3-NEXT: addq %rcx, %rax -; SSSE3-NEXT: adcq %rbp, %rdx -; SSSE3-NEXT: addq %r14, %rax +; SSSE3-NEXT: movzbl %al, %esi +; SSSE3-NEXT: movq %rcx, %rax +; SSSE3-NEXT: mulq %rbp +; SSSE3-NEXT: addq %r8, %rax ; SSSE3-NEXT: adcq %rsi, %rdx -; SSSE3-NEXT: movq %rdi, 24(%r13) -; SSSE3-NEXT: sarq $63, %rdi -; SSSE3-NEXT: xorq %rdi, %rdx -; SSSE3-NEXT: xorq %rax, %rdi +; SSSE3-NEXT: addq %r11, %rax +; SSSE3-NEXT: adcq %r10, %rdx +; SSSE3-NEXT: movq %r9, 24(%r12) +; SSSE3-NEXT: sarq $63, %r9 +; SSSE3-NEXT: xorq %r9, %rdx +; SSSE3-NEXT: xorq %rax, %r9 ; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: orq %rdx, %rdi +; SSSE3-NEXT: orq %rdx, %r9 ; SSSE3-NEXT: setne %al ; SSSE3-NEXT: negl %eax ; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: negl %r12d -; SSSE3-NEXT: movd %r12d, %xmm0 +; SSSE3-NEXT: negl %r15d +; SSSE3-NEXT: movd %r15d, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %r9, 16(%r13) -; SSSE3-NEXT: movq %r10, (%r13) +; SSSE3-NEXT: movq %rbx, 16(%r12) +; SSSE3-NEXT: movq %rdi, (%r12) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 @@ -3571,125 +3569,124 @@ ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx ; SSE41-NEXT: movq %r8, %r14 -; SSE41-NEXT: movq %rcx, %r11 -; SSE41-NEXT: movq %rdx, %r15 -; SSE41-NEXT: movq %rsi, %r13 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE41-NEXT: movq %rsi, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %r14, %rsi -; SSE41-NEXT: imulq %rcx, %rsi +; SSE41-NEXT: movq %rdx, %r8 +; SSE41-NEXT: movq %rsi, %r11 +; SSE41-NEXT: movq %rdi, %r10 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; SSE41-NEXT: movq %r11, %r12 +; SSE41-NEXT: sarq $63, %r12 +; SSE41-NEXT: movq %r14, %rbx +; SSE41-NEXT: imulq %r12, %rbx ; SSE41-NEXT: movq %r14, %rax -; SSE41-NEXT: mulq %rcx -; SSE41-NEXT: movq %rax, %r10 -; SSE41-NEXT: addq %rsi, %rdx -; SSE41-NEXT: imulq %r9, %rcx -; SSE41-NEXT: addq %rdx, %rcx +; SSE41-NEXT: 
mulq %r12 +; SSE41-NEXT: movq %rax, %rdi +; SSE41-NEXT: addq %rbx, %rdx +; SSE41-NEXT: imulq %r9, %r12 +; SSE41-NEXT: addq %rdx, %r12 ; SSE41-NEXT: movq %r9, %rbx ; SSE41-NEXT: sarq $63, %rbx -; SSE41-NEXT: movq %rbx, %rsi -; SSE41-NEXT: imulq %r13, %rsi +; SSE41-NEXT: movq %rbx, %r13 +; SSE41-NEXT: imulq %r11, %r13 ; SSE41-NEXT: movq %rbx, %rax -; SSE41-NEXT: mulq %rdi -; SSE41-NEXT: movq %rax, %r12 -; SSE41-NEXT: addq %rsi, %rdx -; SSE41-NEXT: imulq %rdi, %rbx +; SSE41-NEXT: mulq %r10 +; SSE41-NEXT: movq %rax, %r15 +; SSE41-NEXT: addq %r13, %rdx +; SSE41-NEXT: imulq %r10, %rbx ; SSE41-NEXT: addq %rdx, %rbx -; SSE41-NEXT: addq %r10, %r12 -; SSE41-NEXT: adcq %rcx, %rbx -; SSE41-NEXT: movq %rdi, %rax +; SSE41-NEXT: addq %rdi, %r15 +; SSE41-NEXT: adcq %r12, %rbx +; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %r14 -; SSE41-NEXT: movq %rdx, %rbp -; SSE41-NEXT: movq %rax, %r10 -; SSE41-NEXT: movq %r13, %rax +; SSE41-NEXT: movq %rdx, %r12 +; SSE41-NEXT: movq %rax, %rdi +; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %r14 -; SSE41-NEXT: movq %rdx, %rsi -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: addq %rbp, %rcx -; SSE41-NEXT: adcq $0, %rsi -; SSE41-NEXT: movq %rdi, %rax +; SSE41-NEXT: movq %rdx, %r14 +; SSE41-NEXT: movq %rax, %r13 +; SSE41-NEXT: addq %r12, %r13 +; SSE41-NEXT: adcq $0, %r14 +; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %r9 -; SSE41-NEXT: movq %rdx, %rbp -; SSE41-NEXT: movq %rax, %rdi -; SSE41-NEXT: addq %rcx, %rdi -; SSE41-NEXT: adcq %rsi, %rbp +; SSE41-NEXT: movq %rdx, %r12 +; SSE41-NEXT: movq %rax, %r10 +; SSE41-NEXT: addq %r13, %r10 +; SSE41-NEXT: adcq %r14, %r12 ; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %ecx -; SSE41-NEXT: movq %r13, %rax +; SSE41-NEXT: movzbl %al, %r14d +; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %r9 -; SSE41-NEXT: addq %rbp, %rax -; SSE41-NEXT: adcq %rcx, %rdx ; SSE41-NEXT: addq %r12, %rax +; SSE41-NEXT: adcq %r14, %rdx +; SSE41-NEXT: addq %r15, %rax ; SSE41-NEXT: adcq %rbx, %rdx -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; SSE41-NEXT: movq %rdi, 8(%r13) -; SSE41-NEXT: sarq $63, %rdi -; SSE41-NEXT: xorq %rdi, %rdx -; SSE41-NEXT: xorq %rax, %rdi -; SSE41-NEXT: xorl %r12d, %r12d -; SSE41-NEXT: orq %rdx, %rdi -; SSE41-NEXT: setne %r12b -; SSE41-NEXT: movq %r11, %rdi -; SSE41-NEXT: sarq $63, %rdi -; SSE41-NEXT: movq %r8, %rax -; SSE41-NEXT: movq %r8, %rsi -; SSE41-NEXT: imulq %rdi, %rsi -; SSE41-NEXT: movq %r8, %rbx -; SSE41-NEXT: mulq %rdi -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: addq %rsi, %rdx -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE41-NEXT: imulq %r8, %rdi -; SSE41-NEXT: addq %rdx, %rdi -; SSE41-NEXT: movq %r8, %rsi -; SSE41-NEXT: sarq $63, %rsi -; SSE41-NEXT: movq %rsi, %rbp -; SSE41-NEXT: imulq %r11, %rbp +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE41-NEXT: movq %r10, 8(%r12) +; SSE41-NEXT: sarq $63, %r10 +; SSE41-NEXT: xorq %r10, %rdx +; SSE41-NEXT: xorq %rax, %r10 +; SSE41-NEXT: xorl %r15d, %r15d +; SSE41-NEXT: orq %rdx, %r10 +; SSE41-NEXT: setne %r15b +; SSE41-NEXT: movq %rcx, %rbx +; SSE41-NEXT: sarq $63, %rbx +; SSE41-NEXT: movq %rsi, %r10 +; SSE41-NEXT: imulq %rbx, %r10 ; SSE41-NEXT: movq %rsi, %rax -; SSE41-NEXT: mulq %r15 -; SSE41-NEXT: movq %rax, %r14 -; SSE41-NEXT: addq %rbp, %rdx -; SSE41-NEXT: imulq %r15, %rsi -; SSE41-NEXT: addq %rdx, %rsi -; SSE41-NEXT: addq %rcx, %r14 -; SSE41-NEXT: adcq %rdi, %rsi -; SSE41-NEXT: movq %r15, %rax ; SSE41-NEXT: mulq %rbx -; SSE41-NEXT: movq %rdx, %rcx ; SSE41-NEXT: movq %rax, %r9 -; SSE41-NEXT: movq %r11, %rax -; SSE41-NEXT: mulq %rbx -; SSE41-NEXT: movq %rdx, %rbx 
-; SSE41-NEXT: movq %rax, %rbp -; SSE41-NEXT: addq %rcx, %rbp -; SSE41-NEXT: adcq $0, %rbx -; SSE41-NEXT: movq %r15, %rax +; SSE41-NEXT: addq %r10, %rdx +; SSE41-NEXT: imulq %rbp, %rbx +; SSE41-NEXT: addq %rdx, %rbx +; SSE41-NEXT: movq %rbp, %r10 +; SSE41-NEXT: sarq $63, %r10 +; SSE41-NEXT: movq %r10, %r14 +; SSE41-NEXT: imulq %rcx, %r14 +; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %r8 -; SSE41-NEXT: movq %rdx, %rcx -; SSE41-NEXT: movq %rax, %rdi -; SSE41-NEXT: addq %rbp, %rdi -; SSE41-NEXT: adcq %rbx, %rcx +; SSE41-NEXT: movq %rax, %r11 +; SSE41-NEXT: addq %r14, %rdx +; SSE41-NEXT: imulq %r8, %r10 +; SSE41-NEXT: addq %rdx, %r10 +; SSE41-NEXT: addq %r9, %r11 +; SSE41-NEXT: adcq %rbx, %r10 +; SSE41-NEXT: movq %r8, %rax +; SSE41-NEXT: mulq %rsi +; SSE41-NEXT: movq %rdx, %r9 +; SSE41-NEXT: movq %rax, %rbx +; SSE41-NEXT: movq %rcx, %rax +; SSE41-NEXT: mulq %rsi +; SSE41-NEXT: movq %rdx, %rsi +; SSE41-NEXT: movq %rax, %r14 +; SSE41-NEXT: addq %r9, %r14 +; SSE41-NEXT: adcq $0, %rsi +; SSE41-NEXT: movq %r8, %rax +; SSE41-NEXT: mulq %rbp +; SSE41-NEXT: movq %rdx, %r8 +; SSE41-NEXT: movq %rax, %r9 +; SSE41-NEXT: addq %r14, %r9 +; SSE41-NEXT: adcq %rsi, %r8 ; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %ebp -; SSE41-NEXT: movq %r11, %rax -; SSE41-NEXT: mulq %r8 -; SSE41-NEXT: addq %rcx, %rax -; SSE41-NEXT: adcq %rbp, %rdx -; SSE41-NEXT: addq %r14, %rax +; SSE41-NEXT: movzbl %al, %esi +; SSE41-NEXT: movq %rcx, %rax +; SSE41-NEXT: mulq %rbp +; SSE41-NEXT: addq %r8, %rax ; SSE41-NEXT: adcq %rsi, %rdx -; SSE41-NEXT: movq %rdi, 24(%r13) -; SSE41-NEXT: sarq $63, %rdi -; SSE41-NEXT: xorq %rdi, %rdx -; SSE41-NEXT: xorq %rax, %rdi +; SSE41-NEXT: addq %r11, %rax +; SSE41-NEXT: adcq %r10, %rdx +; SSE41-NEXT: movq %r9, 24(%r12) +; SSE41-NEXT: sarq $63, %r9 +; SSE41-NEXT: xorq %r9, %rdx +; SSE41-NEXT: xorq %rax, %r9 ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: orq %rdx, %r9 ; SSE41-NEXT: setne %al ; SSE41-NEXT: negl %eax -; SSE41-NEXT: negl %r12d -; SSE41-NEXT: movd %r12d, %xmm0 +; SSE41-NEXT: negl %r15d +; SSE41-NEXT: movd %r15d, %xmm0 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: movq %r9, 16(%r13) -; SSE41-NEXT: movq %r10, (%r13) +; SSE41-NEXT: movq %rbx, 16(%r12) +; SSE41-NEXT: movq %rdi, (%r12) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r13 @@ -3707,125 +3704,124 @@ ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx ; AVX-NEXT: movq %r8, %r14 -; AVX-NEXT: movq %rcx, %r11 -; AVX-NEXT: movq %rdx, %r15 -; AVX-NEXT: movq %rsi, %r13 -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: movq %rsi, %rcx -; AVX-NEXT: sarq $63, %rcx -; AVX-NEXT: movq %r14, %rsi -; AVX-NEXT: imulq %rcx, %rsi +; AVX-NEXT: movq %rdx, %r8 +; AVX-NEXT: movq %rsi, %r11 +; AVX-NEXT: movq %rdi, %r10 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; AVX-NEXT: movq %r11, %r12 +; AVX-NEXT: sarq $63, %r12 +; AVX-NEXT: movq %r14, %rbx +; AVX-NEXT: imulq %r12, %rbx ; AVX-NEXT: movq %r14, %rax -; AVX-NEXT: mulq %rcx -; AVX-NEXT: movq %rax, %r10 -; AVX-NEXT: addq %rsi, %rdx -; AVX-NEXT: imulq %r9, %rcx -; AVX-NEXT: addq %rdx, %rcx +; AVX-NEXT: mulq %r12 +; AVX-NEXT: movq %rax, %rdi +; AVX-NEXT: addq %rbx, %rdx +; AVX-NEXT: imulq %r9, %r12 +; AVX-NEXT: addq %rdx, %r12 ; AVX-NEXT: movq %r9, %rbx ; AVX-NEXT: sarq $63, %rbx -; AVX-NEXT: movq %rbx, %rsi -; AVX-NEXT: imulq %r13, %rsi +; AVX-NEXT: movq %rbx, %r13 +; AVX-NEXT: imulq %r11, %r13 ; AVX-NEXT: movq %rbx, %rax -; AVX-NEXT: mulq %rdi -; AVX-NEXT: movq %rax, %r12 -; AVX-NEXT: addq %rsi, %rdx -; 
AVX-NEXT: imulq %rdi, %rbx +; AVX-NEXT: mulq %r10 +; AVX-NEXT: movq %rax, %r15 +; AVX-NEXT: addq %r13, %rdx +; AVX-NEXT: imulq %r10, %rbx ; AVX-NEXT: addq %rdx, %rbx -; AVX-NEXT: addq %r10, %r12 -; AVX-NEXT: adcq %rcx, %rbx -; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: addq %rdi, %r15 +; AVX-NEXT: adcq %r12, %rbx +; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %r14 -; AVX-NEXT: movq %rdx, %rbp -; AVX-NEXT: movq %rax, %r10 -; AVX-NEXT: movq %r13, %rax +; AVX-NEXT: movq %rdx, %r12 +; AVX-NEXT: movq %rax, %rdi +; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %r14 -; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: addq %rbp, %rcx -; AVX-NEXT: adcq $0, %rsi -; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: movq %rdx, %r14 +; AVX-NEXT: movq %rax, %r13 +; AVX-NEXT: addq %r12, %r13 +; AVX-NEXT: adcq $0, %r14 +; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %r9 -; AVX-NEXT: movq %rdx, %rbp -; AVX-NEXT: movq %rax, %rdi -; AVX-NEXT: addq %rcx, %rdi -; AVX-NEXT: adcq %rsi, %rbp +; AVX-NEXT: movq %rdx, %r12 +; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: addq %r13, %r10 +; AVX-NEXT: adcq %r14, %r12 ; AVX-NEXT: setb %al -; AVX-NEXT: movzbl %al, %ecx -; AVX-NEXT: movq %r13, %rax +; AVX-NEXT: movzbl %al, %r14d +; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %r9 -; AVX-NEXT: addq %rbp, %rax -; AVX-NEXT: adcq %rcx, %rdx ; AVX-NEXT: addq %r12, %rax +; AVX-NEXT: adcq %r14, %rdx +; AVX-NEXT: addq %r15, %rax ; AVX-NEXT: adcq %rbx, %rdx -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; AVX-NEXT: movq %rdi, 8(%r13) -; AVX-NEXT: sarq $63, %rdi -; AVX-NEXT: xorq %rdi, %rdx -; AVX-NEXT: xorq %rax, %rdi -; AVX-NEXT: xorl %r12d, %r12d -; AVX-NEXT: orq %rdx, %rdi -; AVX-NEXT: setne %r12b -; AVX-NEXT: movq %r11, %rdi -; AVX-NEXT: sarq $63, %rdi -; AVX-NEXT: movq %r8, %rax -; AVX-NEXT: movq %r8, %rsi -; AVX-NEXT: imulq %rdi, %rsi -; AVX-NEXT: movq %r8, %rbx -; AVX-NEXT: mulq %rdi -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: addq %rsi, %rdx -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: imulq %r8, %rdi -; AVX-NEXT: addq %rdx, %rdi -; AVX-NEXT: movq %r8, %rsi -; AVX-NEXT: sarq $63, %rsi -; AVX-NEXT: movq %rsi, %rbp -; AVX-NEXT: imulq %r11, %rbp +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX-NEXT: movq %r10, 8(%r12) +; AVX-NEXT: sarq $63, %r10 +; AVX-NEXT: xorq %r10, %rdx +; AVX-NEXT: xorq %rax, %r10 +; AVX-NEXT: xorl %r15d, %r15d +; AVX-NEXT: orq %rdx, %r10 +; AVX-NEXT: setne %r15b +; AVX-NEXT: movq %rcx, %rbx +; AVX-NEXT: sarq $63, %rbx +; AVX-NEXT: movq %rsi, %r10 +; AVX-NEXT: imulq %rbx, %r10 ; AVX-NEXT: movq %rsi, %rax -; AVX-NEXT: mulq %r15 -; AVX-NEXT: movq %rax, %r14 -; AVX-NEXT: addq %rbp, %rdx -; AVX-NEXT: imulq %r15, %rsi -; AVX-NEXT: addq %rdx, %rsi -; AVX-NEXT: addq %rcx, %r14 -; AVX-NEXT: adcq %rdi, %rsi -; AVX-NEXT: movq %r15, %rax ; AVX-NEXT: mulq %rbx -; AVX-NEXT: movq %rdx, %rcx ; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: movq %r11, %rax -; AVX-NEXT: mulq %rbx -; AVX-NEXT: movq %rdx, %rbx -; AVX-NEXT: movq %rax, %rbp -; AVX-NEXT: addq %rcx, %rbp -; AVX-NEXT: adcq $0, %rbx -; AVX-NEXT: movq %r15, %rax +; AVX-NEXT: addq %r10, %rdx +; AVX-NEXT: imulq %rbp, %rbx +; AVX-NEXT: addq %rdx, %rbx +; AVX-NEXT: movq %rbp, %r10 +; AVX-NEXT: sarq $63, %r10 +; AVX-NEXT: movq %r10, %r14 +; AVX-NEXT: imulq %rcx, %r14 +; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %r8 -; AVX-NEXT: movq %rdx, %rcx -; AVX-NEXT: movq %rax, %rdi -; AVX-NEXT: addq %rbp, %rdi -; AVX-NEXT: adcq %rbx, %rcx +; AVX-NEXT: movq %rax, %r11 +; AVX-NEXT: addq %r14, %rdx +; AVX-NEXT: imulq %r8, %r10 +; AVX-NEXT: addq %rdx, %r10 +; AVX-NEXT: addq %r9, %r11 +; AVX-NEXT: 
adcq %rbx, %r10 +; AVX-NEXT: movq %r8, %rax +; AVX-NEXT: mulq %rsi +; AVX-NEXT: movq %rdx, %r9 +; AVX-NEXT: movq %rax, %rbx +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: mulq %rsi +; AVX-NEXT: movq %rdx, %rsi +; AVX-NEXT: movq %rax, %r14 +; AVX-NEXT: addq %r9, %r14 +; AVX-NEXT: adcq $0, %rsi +; AVX-NEXT: movq %r8, %rax +; AVX-NEXT: mulq %rbp +; AVX-NEXT: movq %rdx, %r8 +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: addq %r14, %r9 +; AVX-NEXT: adcq %rsi, %r8 ; AVX-NEXT: setb %al -; AVX-NEXT: movzbl %al, %ebp -; AVX-NEXT: movq %r11, %rax -; AVX-NEXT: mulq %r8 -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: adcq %rbp, %rdx -; AVX-NEXT: addq %r14, %rax +; AVX-NEXT: movzbl %al, %esi +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: mulq %rbp +; AVX-NEXT: addq %r8, %rax ; AVX-NEXT: adcq %rsi, %rdx -; AVX-NEXT: movq %rdi, 24(%r13) -; AVX-NEXT: sarq $63, %rdi -; AVX-NEXT: xorq %rdi, %rdx -; AVX-NEXT: xorq %rax, %rdi +; AVX-NEXT: addq %r11, %rax +; AVX-NEXT: adcq %r10, %rdx +; AVX-NEXT: movq %r9, 24(%r12) +; AVX-NEXT: sarq $63, %r9 +; AVX-NEXT: xorq %r9, %rdx +; AVX-NEXT: xorq %rax, %r9 ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: orq %rdx, %rdi +; AVX-NEXT: orq %rdx, %r9 ; AVX-NEXT: setne %al ; AVX-NEXT: negl %eax -; AVX-NEXT: negl %r12d -; AVX-NEXT: vmovd %r12d, %xmm0 +; AVX-NEXT: negl %r15d +; AVX-NEXT: vmovd %r15d, %xmm0 ; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %r9, 16(%r13) -; AVX-NEXT: movq %r10, (%r13) +; AVX-NEXT: movq %rbx, 16(%r12) +; AVX-NEXT: movq %rdi, (%r12) ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r13 @@ -3842,119 +3838,117 @@ ; AVX512F-NEXT: pushq %r13 ; AVX512F-NEXT: pushq %r12 ; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: movq %r9, %r10 -; AVX512F-NEXT: movq %r8, %r9 -; AVX512F-NEXT: movq %rcx, %r14 -; AVX512F-NEXT: movq %rdx, %rcx -; AVX512F-NEXT: movq %rsi, %r11 -; AVX512F-NEXT: movq %rdi, %r15 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512F-NEXT: movq %r14, %rdi -; AVX512F-NEXT: sarq $63, %rdi -; AVX512F-NEXT: movq %r12, %rbx -; AVX512F-NEXT: imulq %rdi, %rbx -; AVX512F-NEXT: movq %r12, %rax -; AVX512F-NEXT: mulq %rdi -; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: movq %r9, %rbp +; AVX512F-NEXT: movq %rcx, %r11 +; AVX512F-NEXT: movq %rdx, %r10 +; AVX512F-NEXT: movq %rsi, %r9 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512F-NEXT: movq %rcx, %r12 +; AVX512F-NEXT: sarq $63, %r12 +; AVX512F-NEXT: movq %r15, %rbx +; AVX512F-NEXT: imulq %r12, %rbx +; AVX512F-NEXT: movq %r15, %rax +; AVX512F-NEXT: mulq %r12 +; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: addq %rbx, %rdx -; AVX512F-NEXT: imulq %r8, %rdi -; AVX512F-NEXT: addq %rdx, %rdi -; AVX512F-NEXT: movq %r8, %rbx +; AVX512F-NEXT: imulq %rsi, %r12 +; AVX512F-NEXT: addq %rdx, %r12 +; AVX512F-NEXT: movq %rsi, %rbx ; AVX512F-NEXT: sarq $63, %rbx -; AVX512F-NEXT: movq %rbx, %rbp -; AVX512F-NEXT: imulq %r14, %rbp +; AVX512F-NEXT: movq %rbx, %r13 +; AVX512F-NEXT: imulq %r11, %r13 ; AVX512F-NEXT: movq %rbx, %rax -; AVX512F-NEXT: mulq %rcx -; AVX512F-NEXT: movq %rax, %r13 -; AVX512F-NEXT: addq %rbp, %rdx -; AVX512F-NEXT: imulq %rcx, %rbx +; AVX512F-NEXT: mulq %r10 +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: addq %r13, %rdx +; AVX512F-NEXT: imulq %r10, %rbx ; AVX512F-NEXT: addq %rdx, %rbx -; AVX512F-NEXT: addq %rsi, %r13 -; AVX512F-NEXT: adcq %rdi, %rbx -; AVX512F-NEXT: movq %rcx, %rax -; AVX512F-NEXT: mulq %r12 -; AVX512F-NEXT: movq %rdx, %rbp -; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
AVX512F-NEXT: movq %r14, %rax -; AVX512F-NEXT: mulq %r12 -; AVX512F-NEXT: movq %rdx, %rdi -; AVX512F-NEXT: movq %rax, %rsi -; AVX512F-NEXT: addq %rbp, %rsi -; AVX512F-NEXT: adcq $0, %rdi -; AVX512F-NEXT: movq %rcx, %rax -; AVX512F-NEXT: mulq %r8 -; AVX512F-NEXT: movq %rdx, %rbp +; AVX512F-NEXT: addq %rcx, %r14 +; AVX512F-NEXT: adcq %r12, %rbx +; AVX512F-NEXT: movq %r10, %rax +; AVX512F-NEXT: mulq %r15 +; AVX512F-NEXT: movq %rdx, %r12 ; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: addq %rsi, %rcx -; AVX512F-NEXT: adcq %rdi, %rbp +; AVX512F-NEXT: movq %r11, %rax +; AVX512F-NEXT: mulq %r15 +; AVX512F-NEXT: movq %rdx, %r15 +; AVX512F-NEXT: movq %rax, %r13 +; AVX512F-NEXT: addq %r12, %r13 +; AVX512F-NEXT: adcq $0, %r15 +; AVX512F-NEXT: movq %r10, %rax +; AVX512F-NEXT: mulq %rsi +; AVX512F-NEXT: movq %rdx, %r12 +; AVX512F-NEXT: movq %rax, %r10 +; AVX512F-NEXT: addq %r13, %r10 +; AVX512F-NEXT: adcq %r15, %r12 ; AVX512F-NEXT: setb %al -; AVX512F-NEXT: movzbl %al, %esi -; AVX512F-NEXT: movq %r14, %rax -; AVX512F-NEXT: mulq %r8 -; AVX512F-NEXT: addq %rbp, %rax -; AVX512F-NEXT: adcq %rsi, %rdx -; AVX512F-NEXT: addq %r13, %rax +; AVX512F-NEXT: movzbl %al, %r15d +; AVX512F-NEXT: movq %r11, %rax +; AVX512F-NEXT: mulq %rsi +; AVX512F-NEXT: addq %r12, %rax +; AVX512F-NEXT: adcq %r15, %rdx +; AVX512F-NEXT: addq %r14, %rax ; AVX512F-NEXT: adcq %rbx, %rdx -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512F-NEXT: movq %rcx, 24(%r8) -; AVX512F-NEXT: sarq $63, %rcx -; AVX512F-NEXT: xorq %rcx, %rdx -; AVX512F-NEXT: xorq %rax, %rcx -; AVX512F-NEXT: orq %rdx, %rcx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512F-NEXT: movq %r10, 24(%r12) +; AVX512F-NEXT: sarq $63, %r10 +; AVX512F-NEXT: xorq %r10, %rdx +; AVX512F-NEXT: xorq %rax, %r10 +; AVX512F-NEXT: orq %rdx, %r10 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: movq %r11, %rdi -; AVX512F-NEXT: sarq $63, %rdi ; AVX512F-NEXT: movq %r9, %rsi -; AVX512F-NEXT: imulq %rdi, %rsi -; AVX512F-NEXT: movq %r9, %rax -; AVX512F-NEXT: mulq %rdi -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: addq %rsi, %rdx -; AVX512F-NEXT: imulq %r10, %rdi -; AVX512F-NEXT: addq %rdx, %rdi -; AVX512F-NEXT: movq %r10, %rsi ; AVX512F-NEXT: sarq $63, %rsi -; AVX512F-NEXT: movq %rsi, %rbp -; AVX512F-NEXT: imulq %r11, %rbp -; AVX512F-NEXT: movq %rsi, %rax -; AVX512F-NEXT: mulq %r15 -; AVX512F-NEXT: movq %rax, %r12 -; AVX512F-NEXT: addq %rbp, %rdx -; AVX512F-NEXT: imulq %r15, %rsi +; AVX512F-NEXT: movq %r8, %r11 +; AVX512F-NEXT: imulq %rsi, %r11 +; AVX512F-NEXT: movq %r8, %rax +; AVX512F-NEXT: mulq %rsi +; AVX512F-NEXT: movq %rax, %r10 +; AVX512F-NEXT: addq %r11, %rdx +; AVX512F-NEXT: imulq %rbp, %rsi ; AVX512F-NEXT: addq %rdx, %rsi -; AVX512F-NEXT: addq %rcx, %r12 -; AVX512F-NEXT: adcq %rdi, %rsi -; AVX512F-NEXT: movq %r15, %rax -; AVX512F-NEXT: mulq %r9 -; AVX512F-NEXT: movq %rdx, %rcx -; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: movq %rbp, %r11 +; AVX512F-NEXT: sarq $63, %r11 +; AVX512F-NEXT: movq %r11, %r14 +; AVX512F-NEXT: imulq %r9, %r14 ; AVX512F-NEXT: movq %r11, %rax -; AVX512F-NEXT: mulq %r9 -; AVX512F-NEXT: movq %rdx, %rbp +; AVX512F-NEXT: mulq %rdi ; AVX512F-NEXT: movq %rax, %rbx -; AVX512F-NEXT: addq %rcx, %rbx -; AVX512F-NEXT: adcq $0, %rbp -; AVX512F-NEXT: movq %r15, %rax -; AVX512F-NEXT: mulq %r10 -; AVX512F-NEXT: movq %rdx, %rcx -; AVX512F-NEXT: movq %rax, %rdi -; AVX512F-NEXT: addq %rbx, %rdi -; AVX512F-NEXT: adcq %rbp, %rcx +; AVX512F-NEXT: addq %r14, %rdx +; AVX512F-NEXT: imulq %rdi, %r11 +; AVX512F-NEXT: addq %rdx, %r11 +; 
AVX512F-NEXT: addq %r10, %rbx +; AVX512F-NEXT: adcq %rsi, %r11 +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: mulq %r8 +; AVX512F-NEXT: movq %rdx, %r10 +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: movq %r9, %rax +; AVX512F-NEXT: mulq %r8 +; AVX512F-NEXT: movq %rdx, %r8 +; AVX512F-NEXT: movq %rax, %r15 +; AVX512F-NEXT: addq %r10, %r15 +; AVX512F-NEXT: adcq $0, %r8 +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: mulq %rbp +; AVX512F-NEXT: movq %rdx, %rdi +; AVX512F-NEXT: movq %rax, %r10 +; AVX512F-NEXT: addq %r15, %r10 +; AVX512F-NEXT: adcq %r8, %rdi ; AVX512F-NEXT: setb %al -; AVX512F-NEXT: movzbl %al, %ebp -; AVX512F-NEXT: movq %r11, %rax -; AVX512F-NEXT: mulq %r10 -; AVX512F-NEXT: addq %rcx, %rax -; AVX512F-NEXT: adcq %rbp, %rdx -; AVX512F-NEXT: addq %r12, %rax +; AVX512F-NEXT: movzbl %al, %esi +; AVX512F-NEXT: movq %r9, %rax +; AVX512F-NEXT: mulq %rbp +; AVX512F-NEXT: addq %rdi, %rax ; AVX512F-NEXT: adcq %rsi, %rdx -; AVX512F-NEXT: movq %rdi, 8(%r8) -; AVX512F-NEXT: sarq $63, %rdi -; AVX512F-NEXT: xorq %rdi, %rdx -; AVX512F-NEXT: xorq %rax, %rdi -; AVX512F-NEXT: orq %rdx, %rdi +; AVX512F-NEXT: addq %rbx, %rax +; AVX512F-NEXT: adcq %r11, %rdx +; AVX512F-NEXT: movq %r10, 8(%r12) +; AVX512F-NEXT: sarq $63, %r10 +; AVX512F-NEXT: xorq %r10, %rdx +; AVX512F-NEXT: xorq %rax, %r10 +; AVX512F-NEXT: orq %rdx, %r10 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: kmovw %eax, %k1 @@ -3962,9 +3956,8 @@ ; AVX512F-NEXT: korw %k0, %k1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512F-NEXT: movq %rax, 16(%r8) -; AVX512F-NEXT: movq %r14, (%r8) +; AVX512F-NEXT: movq %rcx, 16(%r12) +; AVX512F-NEXT: movq %r14, (%r12) ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 ; AVX512F-NEXT: popq %r13 @@ -3981,119 +3974,117 @@ ; AVX512BW-NEXT: pushq %r13 ; AVX512BW-NEXT: pushq %r12 ; AVX512BW-NEXT: pushq %rbx -; AVX512BW-NEXT: movq %r9, %r10 -; AVX512BW-NEXT: movq %r8, %r9 -; AVX512BW-NEXT: movq %rcx, %r14 -; AVX512BW-NEXT: movq %rdx, %rcx -; AVX512BW-NEXT: movq %rsi, %r11 -; AVX512BW-NEXT: movq %rdi, %r15 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512BW-NEXT: movq %r14, %rdi -; AVX512BW-NEXT: sarq $63, %rdi -; AVX512BW-NEXT: movq %r12, %rbx -; AVX512BW-NEXT: imulq %rdi, %rbx -; AVX512BW-NEXT: movq %r12, %rax -; AVX512BW-NEXT: mulq %rdi -; AVX512BW-NEXT: movq %rax, %rsi +; AVX512BW-NEXT: movq %r9, %rbp +; AVX512BW-NEXT: movq %rcx, %r11 +; AVX512BW-NEXT: movq %rdx, %r10 +; AVX512BW-NEXT: movq %rsi, %r9 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512BW-NEXT: movq %rcx, %r12 +; AVX512BW-NEXT: sarq $63, %r12 +; AVX512BW-NEXT: movq %r15, %rbx +; AVX512BW-NEXT: imulq %r12, %rbx +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: mulq %r12 +; AVX512BW-NEXT: movq %rax, %rcx ; AVX512BW-NEXT: addq %rbx, %rdx -; AVX512BW-NEXT: imulq %r8, %rdi -; AVX512BW-NEXT: addq %rdx, %rdi -; AVX512BW-NEXT: movq %r8, %rbx +; AVX512BW-NEXT: imulq %rsi, %r12 +; AVX512BW-NEXT: addq %rdx, %r12 +; AVX512BW-NEXT: movq %rsi, %rbx ; AVX512BW-NEXT: sarq $63, %rbx -; AVX512BW-NEXT: movq %rbx, %rbp -; AVX512BW-NEXT: imulq %r14, %rbp +; AVX512BW-NEXT: movq %rbx, %r13 +; AVX512BW-NEXT: imulq %r11, %r13 ; AVX512BW-NEXT: movq %rbx, %rax -; AVX512BW-NEXT: mulq %rcx -; AVX512BW-NEXT: movq %rax, %r13 -; AVX512BW-NEXT: addq %rbp, %rdx -; AVX512BW-NEXT: imulq %rcx, %rbx +; AVX512BW-NEXT: mulq 
%r10 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: addq %r13, %rdx +; AVX512BW-NEXT: imulq %r10, %rbx ; AVX512BW-NEXT: addq %rdx, %rbx -; AVX512BW-NEXT: addq %rsi, %r13 -; AVX512BW-NEXT: adcq %rdi, %rbx -; AVX512BW-NEXT: movq %rcx, %rax -; AVX512BW-NEXT: mulq %r12 -; AVX512BW-NEXT: movq %rdx, %rbp -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: movq %r14, %rax -; AVX512BW-NEXT: mulq %r12 -; AVX512BW-NEXT: movq %rdx, %rdi -; AVX512BW-NEXT: movq %rax, %rsi -; AVX512BW-NEXT: addq %rbp, %rsi -; AVX512BW-NEXT: adcq $0, %rdi -; AVX512BW-NEXT: movq %rcx, %rax -; AVX512BW-NEXT: mulq %r8 -; AVX512BW-NEXT: movq %rdx, %rbp +; AVX512BW-NEXT: addq %rcx, %r14 +; AVX512BW-NEXT: adcq %r12, %rbx +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: mulq %r15 +; AVX512BW-NEXT: movq %rdx, %r12 ; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: addq %rsi, %rcx -; AVX512BW-NEXT: adcq %rdi, %rbp +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: mulq %r15 +; AVX512BW-NEXT: movq %rdx, %r15 +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: addq %r12, %r13 +; AVX512BW-NEXT: adcq $0, %r15 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: mulq %rsi +; AVX512BW-NEXT: movq %rdx, %r12 +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: addq %r13, %r10 +; AVX512BW-NEXT: adcq %r15, %r12 ; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: movzbl %al, %esi -; AVX512BW-NEXT: movq %r14, %rax -; AVX512BW-NEXT: mulq %r8 -; AVX512BW-NEXT: addq %rbp, %rax -; AVX512BW-NEXT: adcq %rsi, %rdx -; AVX512BW-NEXT: addq %r13, %rax +; AVX512BW-NEXT: movzbl %al, %r15d +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: mulq %rsi +; AVX512BW-NEXT: addq %r12, %rax +; AVX512BW-NEXT: adcq %r15, %rdx +; AVX512BW-NEXT: addq %r14, %rax ; AVX512BW-NEXT: adcq %rbx, %rdx -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512BW-NEXT: movq %rcx, 24(%r8) -; AVX512BW-NEXT: sarq $63, %rcx -; AVX512BW-NEXT: xorq %rcx, %rdx -; AVX512BW-NEXT: xorq %rax, %rcx -; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512BW-NEXT: movq %r10, 24(%r12) +; AVX512BW-NEXT: sarq $63, %r10 +; AVX512BW-NEXT: xorq %r10, %rdx +; AVX512BW-NEXT: xorq %rax, %r10 +; AVX512BW-NEXT: orq %rdx, %r10 ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: movq %r11, %rdi -; AVX512BW-NEXT: sarq $63, %rdi ; AVX512BW-NEXT: movq %r9, %rsi -; AVX512BW-NEXT: imulq %rdi, %rsi -; AVX512BW-NEXT: movq %r9, %rax -; AVX512BW-NEXT: mulq %rdi -; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: addq %rsi, %rdx -; AVX512BW-NEXT: imulq %r10, %rdi -; AVX512BW-NEXT: addq %rdx, %rdi -; AVX512BW-NEXT: movq %r10, %rsi ; AVX512BW-NEXT: sarq $63, %rsi -; AVX512BW-NEXT: movq %rsi, %rbp -; AVX512BW-NEXT: imulq %r11, %rbp -; AVX512BW-NEXT: movq %rsi, %rax -; AVX512BW-NEXT: mulq %r15 -; AVX512BW-NEXT: movq %rax, %r12 -; AVX512BW-NEXT: addq %rbp, %rdx -; AVX512BW-NEXT: imulq %r15, %rsi +; AVX512BW-NEXT: movq %r8, %r11 +; AVX512BW-NEXT: imulq %rsi, %r11 +; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: mulq %rsi +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: addq %r11, %rdx +; AVX512BW-NEXT: imulq %rbp, %rsi ; AVX512BW-NEXT: addq %rdx, %rsi -; AVX512BW-NEXT: addq %rcx, %r12 -; AVX512BW-NEXT: adcq %rdi, %rsi -; AVX512BW-NEXT: movq %r15, %rax -; AVX512BW-NEXT: mulq %r9 -; AVX512BW-NEXT: movq %rdx, %rcx -; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: movq %rbp, %r11 +; AVX512BW-NEXT: sarq $63, %r11 +; AVX512BW-NEXT: movq %r11, %r14 +; AVX512BW-NEXT: imulq %r9, %r14 ; AVX512BW-NEXT: movq %r11, %rax -; AVX512BW-NEXT: 
mulq %r9 -; AVX512BW-NEXT: movq %rdx, %rbp +; AVX512BW-NEXT: mulq %rdi ; AVX512BW-NEXT: movq %rax, %rbx -; AVX512BW-NEXT: addq %rcx, %rbx -; AVX512BW-NEXT: adcq $0, %rbp -; AVX512BW-NEXT: movq %r15, %rax -; AVX512BW-NEXT: mulq %r10 -; AVX512BW-NEXT: movq %rdx, %rcx -; AVX512BW-NEXT: movq %rax, %rdi -; AVX512BW-NEXT: addq %rbx, %rdi -; AVX512BW-NEXT: adcq %rbp, %rcx +; AVX512BW-NEXT: addq %r14, %rdx +; AVX512BW-NEXT: imulq %rdi, %r11 +; AVX512BW-NEXT: addq %rdx, %r11 +; AVX512BW-NEXT: addq %r10, %rbx +; AVX512BW-NEXT: adcq %rsi, %r11 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: mulq %r8 +; AVX512BW-NEXT: movq %rdx, %r10 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: mulq %r8 +; AVX512BW-NEXT: movq %rdx, %r8 +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: addq %r10, %r15 +; AVX512BW-NEXT: adcq $0, %r8 +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: mulq %rbp +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: addq %r15, %r10 +; AVX512BW-NEXT: adcq %r8, %rdi ; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: movzbl %al, %ebp -; AVX512BW-NEXT: movq %r11, %rax -; AVX512BW-NEXT: mulq %r10 -; AVX512BW-NEXT: addq %rcx, %rax -; AVX512BW-NEXT: adcq %rbp, %rdx -; AVX512BW-NEXT: addq %r12, %rax +; AVX512BW-NEXT: movzbl %al, %esi +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: mulq %rbp +; AVX512BW-NEXT: addq %rdi, %rax ; AVX512BW-NEXT: adcq %rsi, %rdx -; AVX512BW-NEXT: movq %rdi, 8(%r8) -; AVX512BW-NEXT: sarq $63, %rdi -; AVX512BW-NEXT: xorq %rdi, %rdx -; AVX512BW-NEXT: xorq %rax, %rdi -; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: addq %rbx, %rax +; AVX512BW-NEXT: adcq %r11, %rdx +; AVX512BW-NEXT: movq %r10, 8(%r12) +; AVX512BW-NEXT: sarq $63, %r10 +; AVX512BW-NEXT: xorq %r10, %rdx +; AVX512BW-NEXT: xorq %rax, %r10 +; AVX512BW-NEXT: orq %rdx, %r10 ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: andl $1, %eax ; AVX512BW-NEXT: kmovw %eax, %k1 @@ -4101,9 +4092,8 @@ ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: movq %rax, 16(%r8) -; AVX512BW-NEXT: movq %r14, (%r8) +; AVX512BW-NEXT: movq %rcx, 16(%r12) +; AVX512BW-NEXT: movq %r14, (%r12) ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 ; AVX512BW-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -460,8 +460,8 @@ ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm8 -; AVX1-NEXT: vpcmpgtd %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7 ; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm7 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3 @@ -470,26 +470,26 @@ ; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 ; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsubd %xmm6, %xmm4, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 +; AVX1-NEXT: vpsubd %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm5 ; AVX1-NEXT: vpsubd %xmm2, 
%xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4 +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm4 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi) +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) ; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) ; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) @@ -1046,110 +1046,110 @@ define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind { ; SSE2-LABEL: ssubo_v2i128: ; SSE2: # %bb.0: -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: subq %r8, %rdi ; SSE2-NEXT: sbbq %r9, %rsi ; SSE2-NEXT: seto %r8b ; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: seto %al -; SSE2-NEXT: movzbl %al, %eax -; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movzbl %r8b, %eax -; SSE2-NEXT: negl %eax -; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: seto %r9b +; SSE2-NEXT: movzbl %r9b, %r9d +; SSE2-NEXT: negl %r9d +; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: movzbl %r8b, %r8d +; SSE2-NEXT: negl %r8d +; SSE2-NEXT: movd %r8d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %rdx, 16(%r10) -; SSE2-NEXT: movq %rdi, (%r10) -; SSE2-NEXT: movq %rcx, 24(%r10) -; SSE2-NEXT: movq %rsi, 8(%r10) +; SSE2-NEXT: movq %rdx, 16(%rax) +; SSE2-NEXT: movq %rdi, (%rax) +; SSE2-NEXT: movq %rcx, 24(%rax) +; SSE2-NEXT: movq %rsi, 8(%rax) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: ssubo_v2i128: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSSE3-NEXT: subq %r8, %rdi ; SSSE3-NEXT: sbbq %r9, %rsi ; SSSE3-NEXT: seto %r8b ; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: seto %al -; SSSE3-NEXT: movzbl %al, %eax -; SSSE3-NEXT: negl %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movzbl %r8b, %eax -; SSSE3-NEXT: negl %eax -; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: seto %r9b +; SSSE3-NEXT: movzbl %r9b, %r9d +; SSSE3-NEXT: negl %r9d +; SSSE3-NEXT: movd %r9d, %xmm1 +; SSSE3-NEXT: movzbl %r8b, %r8d +; SSSE3-NEXT: negl %r8d +; SSSE3-NEXT: movd %r8d, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %rdx, 16(%r10) -; SSSE3-NEXT: movq %rdi, (%r10) -; SSSE3-NEXT: movq %rcx, 24(%r10) -; SSSE3-NEXT: movq %rsi, 8(%r10) +; SSSE3-NEXT: movq %rdx, 16(%rax) +; SSSE3-NEXT: movq %rdi, (%rax) +; SSSE3-NEXT: movq %rcx, 24(%rax) +; SSSE3-NEXT: movq %rsi, 8(%rax) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ssubo_v2i128: ; SSE41: # %bb.0: -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE41-NEXT: subq %r8, %rdi ; SSE41-NEXT: sbbq %r9, %rsi ; SSE41-NEXT: seto %r8b ; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: seto %al -; SSE41-NEXT: movzbl %al, %r9d 
+; SSE41-NEXT: seto %r9b +; SSE41-NEXT: movzbl %r9b, %r9d ; SSE41-NEXT: negl %r9d -; SSE41-NEXT: movzbl %r8b, %eax -; SSE41-NEXT: negl %eax -; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: movzbl %r8b, %r8d +; SSE41-NEXT: negl %r8d +; SSE41-NEXT: movd %r8d, %xmm0 ; SSE41-NEXT: pinsrd $1, %r9d, %xmm0 -; SSE41-NEXT: movq %rdx, 16(%r10) -; SSE41-NEXT: movq %rdi, (%r10) -; SSE41-NEXT: movq %rcx, 24(%r10) -; SSE41-NEXT: movq %rsi, 8(%r10) +; SSE41-NEXT: movq %rdx, 16(%rax) +; SSE41-NEXT: movq %rdi, (%rax) +; SSE41-NEXT: movq %rcx, 24(%rax) +; SSE41-NEXT: movq %rsi, 8(%rax) ; SSE41-NEXT: retq ; ; AVX-LABEL: ssubo_v2i128: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: subq %r8, %rdi ; AVX-NEXT: sbbq %r9, %rsi ; AVX-NEXT: seto %r8b ; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: seto %al -; AVX-NEXT: movzbl %al, %r9d +; AVX-NEXT: seto %r9b +; AVX-NEXT: movzbl %r9b, %r9d ; AVX-NEXT: negl %r9d -; AVX-NEXT: movzbl %r8b, %eax -; AVX-NEXT: negl %eax -; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: movzbl %r8b, %r8d +; AVX-NEXT: negl %r8d +; AVX-NEXT: vmovd %r8d, %xmm0 ; AVX-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0 -; AVX-NEXT: movq %rdx, 16(%r10) -; AVX-NEXT: movq %rdi, (%r10) -; AVX-NEXT: movq %rcx, 24(%r10) -; AVX-NEXT: movq %rsi, 8(%r10) +; AVX-NEXT: movq %rdx, 16(%rax) +; AVX-NEXT: movq %rdi, (%rax) +; AVX-NEXT: movq %rcx, 24(%rax) +; AVX-NEXT: movq %rsi, 8(%rax) ; AVX-NEXT: retq ; ; AVX512-LABEL: ssubo_v2i128: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: seto %al -; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: seto %r10b +; AVX512-NEXT: kmovd %r10d, %k0 ; AVX512-NEXT: subq %r8, %rdi ; AVX512-NEXT: sbbq %r9, %rsi -; AVX512-NEXT: seto %al -; AVX512-NEXT: andl $1, %eax -; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: seto %r8b +; AVX512-NEXT: andl $1, %r8d +; AVX512-NEXT: kmovw %r8d, %k1 ; AVX512-NEXT: kshiftlw $1, %k0, %k0 ; AVX512-NEXT: korw %k0, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: movq %rdx, 16(%r10) -; AVX512-NEXT: movq %rdi, (%r10) -; AVX512-NEXT: movq %rcx, 24(%r10) -; AVX512-NEXT: movq %rsi, 8(%r10) +; AVX512-NEXT: movq %rdx, 16(%rax) +; AVX512-NEXT: movq %rdi, (%rax) +; AVX512-NEXT: movq %rcx, 24(%rax) +; AVX512-NEXT: movq %rsi, 8(%rax) ; AVX512-NEXT: retq %t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -1139,102 +1139,102 @@ define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind { ; SSE2-LABEL: uaddo_v2i128: ; SSE2: # %bb.0: -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: xorl %r11d, %r11d +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: xorl %r10d, %r10d ; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: sbbl %eax, %eax +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: sbbl %r11d, %r11d ; SSE2-NEXT: addq %r8, %rdi ; SSE2-NEXT: adcq %r9, %rsi -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: sbbl %r11d, %r11d -; SSE2-NEXT: movd %r11d, %xmm0 +; SSE2-NEXT: movd %r11d, %xmm1 +; SSE2-NEXT: sbbl 
%r10d, %r10d +; SSE2-NEXT: movd %r10d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %rdx, 16(%r10) -; SSE2-NEXT: movq %rdi, (%r10) -; SSE2-NEXT: movq %rcx, 24(%r10) -; SSE2-NEXT: movq %rsi, 8(%r10) +; SSE2-NEXT: movq %rdx, 16(%rax) +; SSE2-NEXT: movq %rdi, (%rax) +; SSE2-NEXT: movq %rcx, 24(%rax) +; SSE2-NEXT: movq %rsi, 8(%rax) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: uaddo_v2i128: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSSE3-NEXT: xorl %r11d, %r11d +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: xorl %r10d, %r10d ; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSSE3-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: movl $0, %eax -; SSSE3-NEXT: sbbl %eax, %eax +; SSSE3-NEXT: movl $0, %r11d +; SSSE3-NEXT: sbbl %r11d, %r11d ; SSSE3-NEXT: addq %r8, %rdi ; SSSE3-NEXT: adcq %r9, %rsi -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: sbbl %r11d, %r11d -; SSSE3-NEXT: movd %r11d, %xmm0 +; SSSE3-NEXT: movd %r11d, %xmm1 +; SSSE3-NEXT: sbbl %r10d, %r10d +; SSSE3-NEXT: movd %r10d, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %rdx, 16(%r10) -; SSSE3-NEXT: movq %rdi, (%r10) -; SSSE3-NEXT: movq %rcx, 24(%r10) -; SSSE3-NEXT: movq %rsi, 8(%r10) +; SSSE3-NEXT: movq %rdx, 16(%rax) +; SSSE3-NEXT: movq %rdi, (%rax) +; SSSE3-NEXT: movq %rcx, 24(%rax) +; SSSE3-NEXT: movq %rsi, 8(%rax) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: uaddo_v2i128: ; SSE41: # %bb.0: -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE41-NEXT: xorl %r11d, %r11d +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE41-NEXT: xorl %r10d, %r10d ; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: movl $0, %eax -; SSE41-NEXT: sbbl %eax, %eax +; SSE41-NEXT: movl $0, %r11d +; SSE41-NEXT: sbbl %r11d, %r11d ; SSE41-NEXT: addq %r8, %rdi ; SSE41-NEXT: adcq %r9, %rsi -; SSE41-NEXT: sbbl %r11d, %r11d -; SSE41-NEXT: movd %r11d, %xmm0 -; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: movq %rdx, 16(%r10) -; SSE41-NEXT: movq %rdi, (%r10) -; SSE41-NEXT: movq %rcx, 24(%r10) -; SSE41-NEXT: movq %rsi, 8(%r10) +; SSE41-NEXT: sbbl %r10d, %r10d +; SSE41-NEXT: movd %r10d, %xmm0 +; SSE41-NEXT: pinsrd $1, %r11d, %xmm0 +; SSE41-NEXT: movq %rdx, 16(%rax) +; SSE41-NEXT: movq %rdi, (%rax) +; SSE41-NEXT: movq %rcx, 24(%rax) +; SSE41-NEXT: movq %rsi, 8(%rax) ; SSE41-NEXT: retq ; ; AVX-LABEL: uaddo_v2i128: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: xorl %r11d, %r11d +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: xorl %r10d, %r10d ; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movl $0, %eax -; AVX-NEXT: sbbl %eax, %eax +; AVX-NEXT: movl $0, %r11d +; AVX-NEXT: sbbl %r11d, %r11d ; AVX-NEXT: addq %r8, %rdi ; AVX-NEXT: adcq %r9, %rsi -; AVX-NEXT: sbbl %r11d, %r11d -; AVX-NEXT: vmovd %r11d, %xmm0 -; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %rdx, 16(%r10) -; AVX-NEXT: movq %rdi, (%r10) -; AVX-NEXT: movq %rcx, 24(%r10) -; AVX-NEXT: movq %rsi, 8(%r10) +; AVX-NEXT: sbbl %r10d, %r10d +; AVX-NEXT: vmovd %r10d, %xmm0 +; AVX-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0 +; AVX-NEXT: movq %rdx, 16(%rax) +; AVX-NEXT: movq %rdi, (%rax) +; AVX-NEXT: movq %rcx, 24(%rax) +; AVX-NEXT: movq %rsi, 8(%rax) ; AVX-NEXT: retq ; ; AVX512-LABEL: uaddo_v2i128: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; 
AVX512-NEXT: setb %al -; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: setb %r10b +; AVX512-NEXT: kmovd %r10d, %k0 ; AVX512-NEXT: addq %r8, %rdi ; AVX512-NEXT: adcq %r9, %rsi -; AVX512-NEXT: setb %al -; AVX512-NEXT: andl $1, %eax -; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: setb %r8b +; AVX512-NEXT: andl $1, %r8d +; AVX512-NEXT: kmovw %r8d, %k1 ; AVX512-NEXT: kshiftlw $1, %k0, %k0 ; AVX512-NEXT: korw %k0, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: movq %rdx, 16(%r10) -; AVX512-NEXT: movq %rdi, (%r10) -; AVX512-NEXT: movq %rcx, 24(%r10) -; AVX512-NEXT: movq %rsi, 8(%r10) +; AVX512-NEXT: movq %rdx, 16(%rax) +; AVX512-NEXT: movq %rdi, (%rax) +; AVX512-NEXT: movq %rcx, 24(%rax) +; AVX512-NEXT: movq %rsi, 8(%rax) ; AVX512-NEXT: retq %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -485,16 +485,16 @@ ; SSE41-NEXT: pmuludq %xmm3, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] -; SSE41-NEXT: pxor %xmm8, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE41-NEXT: pxor %xmm6, %xmm1 ; SSE41-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero -; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE41-NEXT: pmuludq %xmm7, %xmm3 +; SSE41-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero +; SSE41-NEXT: pmuludq %xmm7, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3],xmm4[4,5],xmm8[6,7] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE41-NEXT: pxor %xmm6, %xmm4 ; SSE41-NEXT: pmulld %xmm2, %xmm0 ; SSE41-NEXT: movq %xmm5, 16(%rcx) @@ -513,17 +513,17 @@ ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 ; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7] +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -587,18 +587,18 @@ ; SSE2-NEXT: pmuludq %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pmuludq %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE2-NEXT: pxor %xmm7, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] @@ -621,18 +621,18 @@ ; SSSE3-NEXT: pmuludq %xmm5, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 ; SSSE3-NEXT: pxor %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm3, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 +; SSSE3-NEXT: pmuludq %xmm8, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 ; SSSE3-NEXT: pxor %xmm7, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] @@ -654,18 +654,18 @@ ; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] -; SSE41-NEXT: pxor %xmm8, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm7, %xmm7 ; SSE41-NEXT: pxor %xmm7, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE41-NEXT: pmuludq %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm5, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmuludq %xmm3, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5],xmm8[6,7] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pxor %xmm7, %xmm5 ; SSE41-NEXT: pmulld %xmm2, %xmm0 ; SSE41-NEXT: pmulld %xmm3, %xmm1 @@ -685,17 +685,17 @@ ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm5, 
%xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 ; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7] +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -747,125 +747,125 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind { ; SSE2-LABEL: umulo_v16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm10, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm11, %xmm11 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm12, %xmm12 +; SSE2-NEXT: pxor %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm13, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm15 -; SSE2-NEXT: pxor %xmm11, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 +; SSE2-NEXT: pxor %xmm12, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm14, %xmm13 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE2-NEXT: pxor %xmm11, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm8 +; SSE2-NEXT: pxor %xmm12, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm7, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 
= xmm3[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm14, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 -; SSE2-NEXT: pxor %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pxor %xmm12, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSE2-NEXT: movdqa %xmm3, 48(%rdi) ; SSE2-NEXT: movdqa %xmm2, 32(%rdi) ; SSE2-NEXT: movdqa %xmm1, 16(%rdi) -; SSE2-NEXT: movdqa %xmm8, (%rdi) -; SSE2-NEXT: movdqa %xmm15, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm9, (%rdi) +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm8, %xmm2 ; SSE2-NEXT: movdqa %xmm6, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v16i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm4, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm10, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3] +; SSSE3-NEXT: movdqa %xmm0, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm4, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm8, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm11 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] +; SSSE3-NEXT: pxor %xmm11, %xmm11 +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm12, %xmm12 +; SSSE3-NEXT: pxor %xmm12, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm13, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] 
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm15 -; SSSE3-NEXT: pxor %xmm11, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 +; SSSE3-NEXT: pxor %xmm12, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm14, %xmm13 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5 -; SSSE3-NEXT: pxor %xmm11, %xmm5 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm8 +; SSSE3-NEXT: pxor %xmm12, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm7, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm14, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm6 -; SSSE3-NEXT: pxor %xmm11, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 +; SSSE3-NEXT: pxor %xmm12, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSSE3-NEXT: movdqa %xmm3, 48(%rdi) ; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) ; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm8, (%rdi) -; SSSE3-NEXT: movdqa %xmm15, %xmm1 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, (%rdi) +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm8, %xmm2 ; SSSE3-NEXT: movdqa %xmm6, %xmm3 ; SSSE3-NEXT: retq ; @@ -925,63 +925,63 @@ ; ; AVX1-LABEL: umulo_v16i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm10 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = 
xmm12[1,1,3,3] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] ; AVX1-NEXT: vpmuludq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpmuludq %xmm10, %xmm12, %xmm7 +; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7] -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpxor %xmm7, %xmm9, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5],xmm4[6,7] -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm8, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm11 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm13 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5 -; AVX1-NEXT: vpackssdw %xmm13, %xmm5, %xmm5 -; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm9, %xmm10, %xmm9 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5],xmm9[6,7] +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm9, %xmm9 +; AVX1-NEXT: vpxor %xmm8, %xmm9, %xmm9 +; AVX1-NEXT: vpackssdw %xmm7, %xmm9, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,3,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm10, %xmm12, %xmm10 +; AVX1-NEXT: vpmuludq %xmm9, %xmm11, %xmm12 +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3],xmm12[4,5],xmm10[6,7] +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm10, %xmm10 +; AVX1-NEXT: vpxor %xmm8, %xmm10, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm12, %xmm13, %xmm12 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3],xmm13[4,5],xmm12[6,7] +; AVX1-NEXT: 
vpcmpeqd %xmm6, %xmm12, %xmm6 +; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpackssdw %xmm10, %xmm6, %xmm6 +; AVX1-NEXT: vpacksswb %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpmulld %xmm9, %xmm11, %xmm8 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm6 -; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; AVX1-NEXT: vpmulld %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpacksswb %xmm11, %xmm11, %xmm1 +; AVX1-NEXT: vpacksswb %xmm7, %xmm7, %xmm1 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-NEXT: vmovdqa %xmm6, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) ; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) -; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm8, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; @@ -1252,79 +1252,79 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; SSE2-NEXT: pmullw %xmm4, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: pand %xmm11, %xmm8 -; SSE2-NEXT: packuswb %xmm7, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm7, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE2-NEXT: pmullw %xmm7, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSE2-NEXT: pmullw %xmm7, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE2-NEXT: pmullw %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm11 -; SSE2-NEXT: packuswb %xmm7, %xmm11 -; SSE2-NEXT: 
psrlw $8, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: packuswb %xmm7, %xmm4 +; SSE2-NEXT: psrlw $8, %xmm8 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm8, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm5, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm6 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm6, %xmm0 ; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm9 +; SSE2-NEXT: psrad $31, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; SSE2-NEXT: psrad $24, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; SSE2-NEXT: psrad $24, %xmm4 -; SSE2-NEXT: movdqa %xmm11, 16(%rsi) -; SSE2-NEXT: movdqa %xmm8, (%rsi) -; SSE2-NEXT: movdqa %xmm4, 64(%rdi) -; SSE2-NEXT: movdqa %xmm7, (%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} 
xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm8 +; SSE2-NEXT: movdqa %xmm4, 16(%rsi) +; SSE2-NEXT: movdqa %xmm2, (%rsi) +; SSE2-NEXT: movdqa %xmm8, 64(%rdi) +; SSE2-NEXT: movdqa %xmm5, (%rdi) ; SSE2-NEXT: movdqa %xmm1, 112(%rdi) -; SSE2-NEXT: movdqa %xmm2, 96(%rdi) -; SSE2-NEXT: movdqa %xmm6, 80(%rdi) +; SSE2-NEXT: movdqa %xmm9, 96(%rdi) +; SSE2-NEXT: movdqa %xmm7, 80(%rdi) ; SSE2-NEXT: movdqa %xmm3, 48(%rdi) -; SSE2-NEXT: movdqa %xmm5, 32(%rdi) +; SSE2-NEXT: movdqa %xmm6, 32(%rdi) ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) ; SSE2-NEXT: retq ; @@ -1337,118 +1337,118 @@ ; SSSE3-NEXT: movdqa %xmm0, %xmm6 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; SSSE3-NEXT: pmullw %xmm4, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pand %xmm11, %xmm7 +; SSSE3-NEXT: pand %xmm4, %xmm7 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSSE3-NEXT: pmullw %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm8 -; SSSE3-NEXT: pand %xmm11, %xmm8 -; SSSE3-NEXT: packuswb %xmm7, %xmm8 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: packuswb %xmm7, %xmm2 ; SSSE3-NEXT: movdqa %xmm3, %xmm7 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSSE3-NEXT: pmullw %xmm7, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: pand %xmm11, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSSE3-NEXT: pmullw %xmm7, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm7 +; SSSE3-NEXT: pand %xmm4, %xmm7 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSSE3-NEXT: pmullw %xmm3, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm11 -; SSSE3-NEXT: packuswb %xmm7, %xmm11 -; SSSE3-NEXT: psrlw $8, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm4 +; SSSE3-NEXT: packuswb %xmm7, %xmm4 +; SSSE3-NEXT: psrlw $8, %xmm8 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: packuswb %xmm8, %xmm1 ; SSSE3-NEXT: pcmpeqb %xmm5, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm3, %xmm1 ; SSSE3-NEXT: psrlw $8, %xmm6 ; SSSE3-NEXT: psrlw $8, %xmm0 ; SSSE3-NEXT: packuswb %xmm6, %xmm0 ; SSSE3-NEXT: pcmpeqb %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm3, 
%xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm6 +; SSSE3-NEXT: psrad $31, %xmm6 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm6 -; SSSE3-NEXT: psrad $31, %xmm6 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm7 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm7 +; SSSE3-NEXT: psrad $31, %xmm7 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm9 +; SSSE3-NEXT: psrad $31, %xmm9 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; SSSE3-NEXT: psrad $24, %xmm7 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; SSSE3-NEXT: psrad $24, %xmm4 -; SSSE3-NEXT: movdqa %xmm11, 16(%rsi) -; SSSE3-NEXT: movdqa %xmm8, (%rsi) -; SSSE3-NEXT: movdqa %xmm4, 64(%rdi) -; SSSE3-NEXT: movdqa %xmm7, (%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm8 +; SSSE3-NEXT: movdqa %xmm4, 16(%rsi) +; SSSE3-NEXT: movdqa %xmm2, (%rsi) +; SSSE3-NEXT: movdqa %xmm8, 64(%rdi) +; SSSE3-NEXT: movdqa %xmm5, (%rdi) ; SSSE3-NEXT: movdqa %xmm1, 112(%rdi) -; SSSE3-NEXT: movdqa %xmm2, 96(%rdi) -; SSSE3-NEXT: movdqa %xmm6, 80(%rdi) +; SSSE3-NEXT: movdqa %xmm9, 96(%rdi) +; SSSE3-NEXT: movdqa %xmm7, 80(%rdi) ; SSSE3-NEXT: movdqa %xmm3, 48(%rdi) -; SSSE3-NEXT: movdqa %xmm5, 32(%rdi) +; SSSE3-NEXT: movdqa %xmm6, 32(%rdi) ; 
SSSE3-NEXT: movdqa %xmm0, 16(%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v32i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: pxor %xmm8, %xmm8 +; SSE41-NEXT: pxor %xmm7, %xmm7 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE41-NEXT: pmullw %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pand %xmm10, %xmm6 +; SSE41-NEXT: pand %xmm2, %xmm6 ; SSE41-NEXT: pmullw %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm9 -; SSE41-NEXT: pand %xmm10, %xmm9 -; SSE41-NEXT: packuswb %xmm6, %xmm9 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pand %xmm2, %xmm5 +; SSE41-NEXT: packuswb %xmm6, %xmm5 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE41-NEXT: pmullw %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pand %xmm10, %xmm3 -; SSE41-NEXT: pmullw %xmm7, %xmm6 -; SSE41-NEXT: pand %xmm6, %xmm10 -; SSE41-NEXT: packuswb %xmm3, %xmm10 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: pmullw %xmm8, %xmm6 +; SSE41-NEXT: pand %xmm6, %xmm2 +; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm6 ; SSE41-NEXT: packuswb %xmm1, %xmm6 -; SSE41-NEXT: pcmpeqb %xmm8, %xmm6 +; SSE41-NEXT: pcmpeqb %xmm7, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm1, %xmm6 ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm4 ; SSE41-NEXT: packuswb 
%xmm0, %xmm4 -; SSE41-NEXT: pcmpeqb %xmm8, %xmm4 +; SSE41-NEXT: pcmpeqb %xmm7, %xmm4 ; SSE41-NEXT: pxor %xmm1, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -1466,22 +1466,22 @@ ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm7 ; SSE41-NEXT: psrad $31, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm5 -; SSE41-NEXT: psrad $31, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,3,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm8 +; SSE41-NEXT: psrad $31, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[3,3,3,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm9 +; SSE41-NEXT: psrad $31, %xmm9 ; SSE41-NEXT: pmovsxbd %xmm4, %xmm4 ; SSE41-NEXT: pmovsxbd %xmm6, %xmm6 -; SSE41-NEXT: movdqa %xmm10, 16(%rsi) -; SSE41-NEXT: movdqa %xmm9, (%rsi) +; SSE41-NEXT: movdqa %xmm2, 16(%rsi) +; SSE41-NEXT: movdqa %xmm5, (%rsi) ; SSE41-NEXT: movdqa %xmm6, 64(%rdi) ; SSE41-NEXT: movdqa %xmm4, (%rdi) -; SSE41-NEXT: movdqa %xmm2, 112(%rdi) -; SSE41-NEXT: movdqa %xmm5, 96(%rdi) +; SSE41-NEXT: movdqa %xmm9, 112(%rdi) +; SSE41-NEXT: movdqa %xmm8, 96(%rdi) ; SSE41-NEXT: movdqa %xmm7, 80(%rdi) ; SSE41-NEXT: movdqa %xmm3, 48(%rdi) ; SSE41-NEXT: movdqa %xmm1, 32(%rdi) @@ -1500,24 +1500,24 @@ ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm7 -; AVX1-NEXT: vpackuswb %xmm4, %xmm7, %xmm8 +; AVX1-NEXT: vpackuswb %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm7 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm5, %xmm0, 
%xmm1 -; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm5 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1 +; AVX1-NEXT: vpackuswb %xmm8, %xmm1, %xmm5 +; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm3 ; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0 @@ -1527,8 +1527,8 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm7, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] @@ -1536,13 +1536,13 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi) -; AVX1-NEXT: vmovdqa %xmm8, (%rdi) +; AVX1-NEXT: vmovdqa %xmm4, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: umulo_v32i8: @@ -1629,61 +1629,61 @@ ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; SSE2-NEXT: pmullw %xmm8, %xmm10 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pand %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm8, %xmm11 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE2-NEXT: pmullw %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm8, %xmm11 -; SSE2-NEXT: packuswb %xmm12, %xmm11 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; SSE2-NEXT: pmullw %xmm4, %xmm13 -; SSE2-NEXT: movdqa %xmm13, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: packuswb %xmm11, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm12 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: 
punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; SSE2-NEXT: pmullw %xmm12, %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pand %xmm8, %xmm12 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] ; SSE2-NEXT: pmullw %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm12 -; SSE2-NEXT: pand %xmm8, %xmm12 -; SSE2-NEXT: packuswb %xmm4, %xmm12 -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; SSE2-NEXT: pmullw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: packuswb %xmm12, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; SSE2-NEXT: movdqa %xmm2, %xmm12 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSE2-NEXT: pmullw %xmm13, %xmm12 +; SSE2-NEXT: movdqa %xmm12, %xmm13 +; SSE2-NEXT: pand %xmm8, %xmm13 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] ; SSE2-NEXT: pmullw %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm14 -; SSE2-NEXT: pand %xmm8, %xmm14 -; SSE2-NEXT: packuswb %xmm4, %xmm14 -; SSE2-NEXT: movdqa %xmm7, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] -; SSE2-NEXT: pmullw %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: packuswb %xmm13, %xmm6 +; SSE2-NEXT: movdqa %xmm7, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; SSE2-NEXT: movdqa %xmm3, %xmm14 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] +; SSE2-NEXT: pmullw %xmm13, %xmm14 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = 
xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE2-NEXT: pmullw %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: packuswb %xmm4, %xmm8 -; SSE2-NEXT: psrlw $8, %xmm6 +; SSE2-NEXT: packuswb %xmm7, %xmm8 +; SSE2-NEXT: psrlw $8, %xmm14 ; SSE2-NEXT: psrlw $8, %xmm3 -; SSE2-NEXT: packuswb %xmm6, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm5 +; SSE2-NEXT: packuswb %xmm14, %xmm3 +; SSE2-NEXT: psrlw $8, %xmm12 ; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm13 +; SSE2-NEXT: packuswb %xmm12, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm11 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 +; SSE2-NEXT: packuswb %xmm11, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm10 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm10, %xmm0 @@ -1691,42 +1691,42 @@ ; SSE2-NEXT: pcmpeqb %xmm9, %xmm2 ; SSE2-NEXT: pcmpeqb %xmm9, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm9, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 +; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm2 +; SSE2-NEXT: pxor %xmm7, %xmm1 +; SSE2-NEXT: pxor %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm8, 48(%rsi) -; SSE2-NEXT: movdqa %xmm14, 32(%rsi) -; SSE2-NEXT: movdqa %xmm12, 16(%rsi) -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm11, (%rsi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm5 -; SSE2-NEXT: movdqa %xmm5, 192(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm5 -; SSE2-NEXT: movdqa %xmm5, 128(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm5 -; SSE2-NEXT: movdqa %xmm5, 64(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm5 -; SSE2-NEXT: movdqa %xmm5, (%rdi) -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm6, 32(%rsi) +; SSE2-NEXT: movdqa %xmm5, 16(%rsi) +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm4, (%rsi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = 
xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 224(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 192(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 128(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 64(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: movdqa %xmm5, 240(%rdi) +; SSE2-NEXT: movdqa %xmm5, 224(%rdi) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 240(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -1792,61 +1792,61 @@ ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; SSSE3-NEXT: pmullw %xmm8, %xmm10 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSSE3-NEXT: movdqa %xmm10, %xmm12 -; SSSE3-NEXT: pand %xmm8, %xmm12 +; SSSE3-NEXT: movdqa %xmm10, %xmm11 +; SSSE3-NEXT: pand %xmm8, %xmm11 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSSE3-NEXT: pmullw %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm11 -; SSSE3-NEXT: pand %xmm8, %xmm11 -; SSSE3-NEXT: packuswb %xmm12, %xmm11 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSSE3-NEXT: movdqa %xmm1, %xmm13 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; SSSE3-NEXT: pmullw %xmm4, %xmm13 -; SSSE3-NEXT: movdqa %xmm13, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: packuswb %xmm11, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm12 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSSE3-NEXT: movdqa %xmm1, 
%xmm11 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; SSSE3-NEXT: pmullw %xmm12, %xmm11 +; SSSE3-NEXT: movdqa %xmm11, %xmm12 +; SSSE3-NEXT: pand %xmm8, %xmm12 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] ; SSSE3-NEXT: pmullw %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm12 -; SSSE3-NEXT: pand %xmm8, %xmm12 -; SSSE3-NEXT: packuswb %xmm4, %xmm12 -; SSSE3-NEXT: movdqa %xmm6, %xmm4 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; SSSE3-NEXT: pmullw %xmm4, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: packuswb %xmm12, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm13 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; SSSE3-NEXT: movdqa %xmm2, %xmm12 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSSE3-NEXT: pmullw %xmm13, %xmm12 +; SSSE3-NEXT: movdqa %xmm12, %xmm13 +; SSSE3-NEXT: pand %xmm8, %xmm13 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] ; SSSE3-NEXT: pmullw %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm14 -; SSSE3-NEXT: pand %xmm8, %xmm14 -; SSSE3-NEXT: packuswb %xmm4, %xmm14 -; SSSE3-NEXT: movdqa %xmm7, %xmm4 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] -; SSSE3-NEXT: pmullw %xmm4, %xmm6 +; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: packuswb %xmm13, %xmm6 +; SSSE3-NEXT: movdqa %xmm7, %xmm13 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; SSSE3-NEXT: movdqa %xmm3, %xmm14 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] +; SSSE3-NEXT: pmullw %xmm13, %xmm14 ; SSSE3-NEXT: punpcklbw 
{{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSSE3-NEXT: pmullw %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm6, %xmm4 -; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm14, %xmm7 +; SSSE3-NEXT: pand %xmm8, %xmm7 ; SSSE3-NEXT: pand %xmm3, %xmm8 -; SSSE3-NEXT: packuswb %xmm4, %xmm8 -; SSSE3-NEXT: psrlw $8, %xmm6 +; SSSE3-NEXT: packuswb %xmm7, %xmm8 +; SSSE3-NEXT: psrlw $8, %xmm14 ; SSSE3-NEXT: psrlw $8, %xmm3 -; SSSE3-NEXT: packuswb %xmm6, %xmm3 -; SSSE3-NEXT: psrlw $8, %xmm5 +; SSSE3-NEXT: packuswb %xmm14, %xmm3 +; SSSE3-NEXT: psrlw $8, %xmm12 ; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: packuswb %xmm5, %xmm2 -; SSSE3-NEXT: psrlw $8, %xmm13 +; SSSE3-NEXT: packuswb %xmm12, %xmm2 +; SSSE3-NEXT: psrlw $8, %xmm11 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: packuswb %xmm13, %xmm1 +; SSSE3-NEXT: packuswb %xmm11, %xmm1 ; SSSE3-NEXT: psrlw $8, %xmm10 ; SSSE3-NEXT: psrlw $8, %xmm0 ; SSSE3-NEXT: packuswb %xmm10, %xmm0 @@ -1854,42 +1854,42 @@ ; SSSE3-NEXT: pcmpeqb %xmm9, %xmm2 ; SSSE3-NEXT: pcmpeqb %xmm9, %xmm1 ; SSSE3-NEXT: pcmpeqb %xmm9, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 +; SSSE3-NEXT: pxor %xmm7, %xmm3 +; SSSE3-NEXT: pxor %xmm7, %xmm2 +; SSSE3-NEXT: pxor %xmm7, %xmm1 +; SSSE3-NEXT: pxor %xmm7, %xmm0 ; SSSE3-NEXT: movdqa %xmm8, 48(%rsi) -; SSSE3-NEXT: movdqa %xmm14, 32(%rsi) -; SSSE3-NEXT: movdqa %xmm12, 16(%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm11, (%rsi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $24, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, 192(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $24, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, 128(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $24, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, 64(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $24, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, (%rdi) -; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, 32(%rsi) +; SSSE3-NEXT: movdqa %xmm5, 16(%rsi) +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: movdqa %xmm4, (%rsi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 224(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 192(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 128(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 64(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm5 ; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, 240(%rdi) +; SSSE3-NEXT: movdqa %xmm5, 224(%rdi) +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 240(%rdi) ; SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -2094,94 +2094,94 @@ ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm8 +; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm4 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm11 -; AVX1-NEXT: vpand %xmm6, %xmm11, %xmm4 -; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm8 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm8 +; AVX1-NEXT: vpackuswb %xmm4, %xmm8, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm12 -; AVX1-NEXT: vpand %xmm6, %xmm12, %xmm7 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; AVX1-NEXT: vpmullw %xmm8, %xmm10, %xmm8 +; AVX1-NEXT: vpand %xmm9, %xmm8, %xmm11 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm13 -; AVX1-NEXT: vpand %xmm6, %xmm13, %xmm2 -; AVX1-NEXT: vpackuswb %xmm7, %xmm2, %xmm10 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm10 +; AVX1-NEXT: vpand %xmm9, %xmm10, %xmm0 +; AVX1-NEXT: vpackuswb %xmm11, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; AVX1-NEXT: vpmullw %xmm2, %xmm7, %xmm7 -; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm4 -; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm14 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX1-NEXT: vpmullw %xmm2, %xmm11, %xmm11 +; AVX1-NEXT: vpand %xmm9, %xmm11, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmullw %xmm12, %xmm13, %xmm12 +; AVX1-NEXT: vpand %xmm9, %xmm12, %xmm13 +; AVX1-NEXT: vpackuswb %xmm2, %xmm13, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX1-NEXT: vpmullw %xmm13, %xmm14, %xmm13 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm4 -; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm15 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm9, %xmm13, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm9 +; AVX1-NEXT: vpackuswb %xmm1, %xmm9, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm9 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm12, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm4 -; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm6 -; AVX1-NEXT: vpackuswb %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm9 +; AVX1-NEXT: vpsrlw $8, %xmm12, %xmm11 +; AVX1-NEXT: vpackuswb %xmm9, %xmm11, %xmm9 +; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8 +; AVX1-NEXT: vpsrlw $8, %xmm10, %xmm10 +; AVX1-NEXT: vpackuswb %xmm8, %xmm10, %xmm8 +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 +; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm2, %xmm6 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm5 -; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm3 -; AVX1-NEXT: vmovdqa %xmm15, 48(%rsi) -; AVX1-NEXT: vmovdqa %xmm14, 32(%rsi) -; AVX1-NEXT: vmovdqa %xmm10, 16(%rsi) -; AVX1-NEXT: vmovdqa %xmm8, (%rsi) -; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm9, %xmm9 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm8, %xmm8 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm10 +; AVX1-NEXT: vpcmpeqd %xmm11, %xmm11, %xmm11 +; AVX1-NEXT: vpxor %xmm3, %xmm11, %xmm7 +; AVX1-NEXT: vpxor %xmm11, %xmm9, %xmm6 +; AVX1-NEXT: vpxor %xmm11, %xmm8, %xmm5 +; AVX1-NEXT: vpxor %xmm11, %xmm10, %xmm3 +; AVX1-NEXT: vmovdqa %xmm1, 48(%rsi) +; AVX1-NEXT: vmovdqa %xmm2, 32(%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm4, (%rsi) +; AVX1-NEXT: vpmovsxbd %xmm7, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 192(%rdi) -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm0 +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 128(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 64(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 224(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; AVX1-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 240(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 208(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 160(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 176(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 144(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] @@ -2218,7 +2218,7 @@ ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23] ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm0 -; AVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm9 +; AVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm0 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31] ; AVX2-NEXT: vpmullw %ymm7, %ymm8, %ymm7 @@ -2227,7 +2227,7 @@ ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] ; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm6, %ymm1, %ymm3 -; AVX2-NEXT: vpackuswb %ymm8, %ymm3, %ymm8 +; AVX2-NEXT: vpackuswb %ymm8, %ymm3, %ymm3 ; AVX2-NEXT: vpsrlw $8, %ymm7, %ymm6 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 @@ -2246,20 +2246,20 @@ ; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm9, %ymm9 ; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 ; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 -; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm8, 32(%rsi) -; AVX2-NEXT: vmovdqa %ymm9, (%rsi) -; AVX2-NEXT: vmovdqa %ymm0, 192(%rdi) +; AVX2-NEXT: vpmovsxbd %xmm8, %ymm8 +; AVX2-NEXT: vmovdqa %ymm3, 32(%rsi) +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm8, 192(%rdi) ; AVX2-NEXT: vmovdqa %ymm1, 128(%rdi) ; 
AVX2-NEXT: vmovdqa %ymm5, 64(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) -; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi) +; AVX2-NEXT: vmovdqa %ymm9, 224(%rdi) ; AVX2-NEXT: vmovdqa %ymm7, 160(%rdi) ; AVX2-NEXT: vmovdqa %ymm6, 96(%rdi) ; AVX2-NEXT: vmovdqa %ymm4, 32(%rdi) @@ -2459,24 +2459,24 @@ ; SSE2-LABEL: umulo_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %r8 +; SSE2-NEXT: movq %xmm2, %rcx ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %r10 +; SSE2-NEXT: movq %xmm2, %rsi ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: movq %xmm1, %rdx -; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: xorl %r8d, %r8d ; SSE2-NEXT: mulq %rdx ; SSE2-NEXT: movq $-1, %r9 -; SSE2-NEXT: movl $0, %esi -; SSE2-NEXT: cmovoq %r9, %rsi +; SSE2-NEXT: movl $0, %r10d +; SSE2-NEXT: cmovoq %r9, %r10 ; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: movq %r8, %rax -; SSE2-NEXT: mulq %r10 +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: mulq %rsi ; SSE2-NEXT: movq %rax, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movq %rsi, %xmm0 -; SSE2-NEXT: cmovoq %r9, %rcx -; SSE2-NEXT: movq %rcx, %xmm2 +; SSE2-NEXT: movq %r10, %xmm0 +; SSE2-NEXT: cmovoq %r9, %r8 +; SSE2-NEXT: movq %r8, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: movdqa %xmm1, (%rdi) @@ -2485,24 +2485,24 @@ ; SSSE3-LABEL: umulo_v2i64: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSSE3-NEXT: movq %xmm2, %r8 +; SSSE3-NEXT: movq %xmm2, %rcx ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSSE3-NEXT: movq %xmm2, %r10 +; SSSE3-NEXT: movq %xmm2, %rsi ; SSSE3-NEXT: movq %xmm0, %rax ; SSSE3-NEXT: movq %xmm1, %rdx -; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: xorl %r8d, %r8d ; SSSE3-NEXT: mulq %rdx ; SSSE3-NEXT: movq $-1, %r9 -; SSSE3-NEXT: movl $0, %esi -; SSSE3-NEXT: cmovoq %r9, %rsi +; SSSE3-NEXT: movl $0, %r10d +; SSSE3-NEXT: cmovoq %r9, %r10 ; SSSE3-NEXT: movq %rax, %xmm1 -; SSSE3-NEXT: movq %r8, %rax -; SSSE3-NEXT: mulq %r10 +; SSSE3-NEXT: movq %rcx, %rax +; SSSE3-NEXT: mulq %rsi ; SSSE3-NEXT: movq %rax, %xmm0 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSSE3-NEXT: movq %rsi, %xmm0 -; SSSE3-NEXT: cmovoq %r9, %rcx -; SSSE3-NEXT: movq %rcx, %xmm2 +; SSSE3-NEXT: movq %r10, %xmm0 +; SSSE3-NEXT: cmovoq %r9, %r8 +; SSSE3-NEXT: movq %r8, %xmm2 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: movdqa %xmm1, (%rdi) @@ -2510,23 +2510,23 @@ ; ; SSE41-LABEL: umulo_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %xmm0, %r10 -; SSE41-NEXT: movq %xmm1, %r8 +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: movq %xmm1, %rsi ; SSE41-NEXT: pextrq $1, %xmm0, %rax ; SSE41-NEXT: pextrq $1, %xmm1, %rdx -; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: xorl %r8d, %r8d ; SSE41-NEXT: mulq %rdx ; SSE41-NEXT: movq $-1, %r9 -; SSE41-NEXT: movl $0, %ecx -; SSE41-NEXT: cmovoq %r9, %rcx +; SSE41-NEXT: movl $0, %r10d +; SSE41-NEXT: cmovoq %r9, %r10 ; SSE41-NEXT: movq %rax, %xmm0 -; SSE41-NEXT: movq %r10, %rax -; SSE41-NEXT: mulq %r8 +; SSE41-NEXT: movq %rcx, %rax +; SSE41-NEXT: mulq %rsi ; SSE41-NEXT: movq %rax, %xmm1 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE41-NEXT: movq %rcx, %xmm0 -; SSE41-NEXT: cmovoq %r9, %rsi -; SSE41-NEXT: movq %rsi, %xmm2 +; SSE41-NEXT: movq %r10, %xmm0 +; SSE41-NEXT: cmovoq %r9, %r8 +; SSE41-NEXT: movq %r8, %xmm2 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE41-NEXT: 
pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE41-NEXT: movdqa %xmm1, (%rdi) @@ -2534,23 +2534,23 @@ ; ; AVX-LABEL: umulo_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovq %xmm0, %r10 -; AVX-NEXT: vmovq %xmm1, %r8 +; AVX-NEXT: vmovq %xmm0, %rcx +; AVX-NEXT: vmovq %xmm1, %rsi ; AVX-NEXT: vpextrq $1, %xmm0, %rax ; AVX-NEXT: vpextrq $1, %xmm1, %rdx -; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: xorl %r8d, %r8d ; AVX-NEXT: mulq %rdx ; AVX-NEXT: movq $-1, %r9 -; AVX-NEXT: movl $0, %ecx -; AVX-NEXT: cmovoq %r9, %rcx +; AVX-NEXT: movl $0, %r10d +; AVX-NEXT: cmovoq %r9, %r10 ; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: movq %r10, %rax -; AVX-NEXT: mulq %r8 +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: mulq %rsi ; AVX-NEXT: vmovq %rax, %xmm1 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovq %rcx, %xmm0 -; AVX-NEXT: cmovoq %r9, %rsi -; AVX-NEXT: vmovq %rsi, %xmm2 +; AVX-NEXT: vmovq %r10, %xmm0 +; AVX-NEXT: cmovoq %r9, %r8 +; AVX-NEXT: vmovq %r8, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vmovdqa %xmm1, (%rdi) @@ -2907,14 +2907,13 @@ ; SSE2-NEXT: pushq %rbp ; SSE2-NEXT: pushq %r15 ; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: movq %r9, %r10 -; SSE2-NEXT: movq %rcx, %r12 -; SSE2-NEXT: movq %rdx, %r11 +; SSE2-NEXT: movq %rcx, %r11 +; SSE2-NEXT: movq %rdx, %rcx ; SSE2-NEXT: movq %rsi, %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; SSE2-NEXT: testq %r10, %r10 @@ -2924,54 +2923,53 @@ ; SSE2-NEXT: andb %dl, %bpl ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: seto %bl +; SSE2-NEXT: seto %r15b ; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %rdi -; SSE2-NEXT: seto %cl -; SSE2-NEXT: orb %bl, %cl -; SSE2-NEXT: leaq (%rsi,%rax), %rbx +; SSE2-NEXT: seto %r12b +; SSE2-NEXT: orb %r15b, %r12b +; SSE2-NEXT: leaq (%rsi,%rax), %r10 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %rdi ; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: addq %rbx, %rsi -; SSE2-NEXT: setb %r13b -; SSE2-NEXT: orb %cl, %r13b -; SSE2-NEXT: orb %bpl, %r13b +; SSE2-NEXT: addq %r10, %rsi +; SSE2-NEXT: setb %r10b +; SSE2-NEXT: orb %r12b, %r10b +; SSE2-NEXT: orb %bpl, %r10b ; SSE2-NEXT: testq %r9, %r9 ; SSE2-NEXT: setne %al -; SSE2-NEXT: testq %r12, %r12 -; SSE2-NEXT: setne %r10b -; SSE2-NEXT: andb %al, %r10b -; SSE2-NEXT: movq %r12, %rax +; SSE2-NEXT: testq %r11, %r11 +; SSE2-NEXT: setne %bpl +; SSE2-NEXT: andb %al, %bpl +; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %r14 -; SSE2-NEXT: movq %rax, %rbp -; SSE2-NEXT: seto %r8b +; SSE2-NEXT: movq %rax, %r8 +; SSE2-NEXT: seto %r11b ; SSE2-NEXT: movq %r9, %rax -; SSE2-NEXT: mulq %r11 -; SSE2-NEXT: seto %cl -; SSE2-NEXT: orb %r8b, %cl -; SSE2-NEXT: addq %rax, %rbp -; SSE2-NEXT: movq %r11, %rax +; SSE2-NEXT: mulq %rcx +; SSE2-NEXT: seto %r9b +; SSE2-NEXT: orb %r11b, %r9b +; SSE2-NEXT: addq %rax, %r8 +; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %r14 -; SSE2-NEXT: addq %rbp, %rdx -; SSE2-NEXT: setb %bl -; SSE2-NEXT: orb %cl, %bl -; SSE2-NEXT: orb %r10b, %bl -; SSE2-NEXT: movzbl %bl, %ecx +; SSE2-NEXT: addq %r8, %rdx +; SSE2-NEXT: setb %cl +; SSE2-NEXT: orb %r9b, %cl +; SSE2-NEXT: orb %bpl, %cl +; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: movzbl %r13b, %ecx +; SSE2-NEXT: movzbl %r10b, %ecx ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; 
SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %rax, 16(%r15) -; SSE2-NEXT: movq %rdi, (%r15) -; SSE2-NEXT: movq %rdx, 24(%r15) -; SSE2-NEXT: movq %rsi, 8(%r15) +; SSE2-NEXT: movq %rax, 16(%rbx) +; SSE2-NEXT: movq %rdi, (%rbx) +; SSE2-NEXT: movq %rdx, 24(%rbx) +; SSE2-NEXT: movq %rsi, 8(%rbx) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 -; SSE2-NEXT: popq %r13 ; SSE2-NEXT: popq %r14 ; SSE2-NEXT: popq %r15 ; SSE2-NEXT: popq %rbp @@ -2982,14 +2980,13 @@ ; SSSE3-NEXT: pushq %rbp ; SSSE3-NEXT: pushq %r15 ; SSSE3-NEXT: pushq %r14 -; SSSE3-NEXT: pushq %r13 ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx ; SSSE3-NEXT: movq %r9, %r10 -; SSSE3-NEXT: movq %rcx, %r12 -; SSSE3-NEXT: movq %rdx, %r11 +; SSSE3-NEXT: movq %rcx, %r11 +; SSSE3-NEXT: movq %rdx, %rcx ; SSSE3-NEXT: movq %rsi, %rax -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; SSSE3-NEXT: testq %r10, %r10 @@ -2999,54 +2996,53 @@ ; SSSE3-NEXT: andb %dl, %bpl ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %rsi -; SSSE3-NEXT: seto %bl +; SSSE3-NEXT: seto %r15b ; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %rdi -; SSSE3-NEXT: seto %cl -; SSSE3-NEXT: orb %bl, %cl -; SSSE3-NEXT: leaq (%rsi,%rax), %rbx +; SSSE3-NEXT: seto %r12b +; SSSE3-NEXT: orb %r15b, %r12b +; SSSE3-NEXT: leaq (%rsi,%rax), %r10 ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %rdi ; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: addq %rbx, %rsi -; SSSE3-NEXT: setb %r13b -; SSSE3-NEXT: orb %cl, %r13b -; SSSE3-NEXT: orb %bpl, %r13b +; SSSE3-NEXT: addq %r10, %rsi +; SSSE3-NEXT: setb %r10b +; SSSE3-NEXT: orb %r12b, %r10b +; SSSE3-NEXT: orb %bpl, %r10b ; SSSE3-NEXT: testq %r9, %r9 ; SSSE3-NEXT: setne %al -; SSSE3-NEXT: testq %r12, %r12 -; SSSE3-NEXT: setne %r10b -; SSSE3-NEXT: andb %al, %r10b -; SSSE3-NEXT: movq %r12, %rax +; SSSE3-NEXT: testq %r11, %r11 +; SSSE3-NEXT: setne %bpl +; SSSE3-NEXT: andb %al, %bpl +; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %r14 -; SSSE3-NEXT: movq %rax, %rbp -; SSSE3-NEXT: seto %r8b +; SSSE3-NEXT: movq %rax, %r8 +; SSSE3-NEXT: seto %r11b ; SSSE3-NEXT: movq %r9, %rax -; SSSE3-NEXT: mulq %r11 -; SSSE3-NEXT: seto %cl -; SSSE3-NEXT: orb %r8b, %cl -; SSSE3-NEXT: addq %rax, %rbp -; SSSE3-NEXT: movq %r11, %rax +; SSSE3-NEXT: mulq %rcx +; SSSE3-NEXT: seto %r9b +; SSSE3-NEXT: orb %r11b, %r9b +; SSSE3-NEXT: addq %rax, %r8 +; SSSE3-NEXT: movq %rcx, %rax ; SSSE3-NEXT: mulq %r14 -; SSSE3-NEXT: addq %rbp, %rdx -; SSSE3-NEXT: setb %bl -; SSSE3-NEXT: orb %cl, %bl -; SSSE3-NEXT: orb %r10b, %bl -; SSSE3-NEXT: movzbl %bl, %ecx +; SSSE3-NEXT: addq %r8, %rdx +; SSSE3-NEXT: setb %cl +; SSSE3-NEXT: orb %r9b, %cl +; SSSE3-NEXT: orb %bpl, %cl +; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: movzbl %r13b, %ecx +; SSSE3-NEXT: movzbl %r10b, %ecx ; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %rax, 16(%r15) -; SSSE3-NEXT: movq %rdi, (%r15) -; SSSE3-NEXT: movq %rdx, 24(%r15) -; SSSE3-NEXT: movq %rsi, 8(%r15) +; SSSE3-NEXT: movq %rax, 16(%rbx) +; SSSE3-NEXT: movq %rdi, (%rbx) +; SSSE3-NEXT: movq %rdx, 24(%rbx) +; SSSE3-NEXT: movq %rsi, 8(%rbx) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 -; SSSE3-NEXT: popq %r13 ; SSSE3-NEXT: popq %r14 ; SSSE3-NEXT: popq %r15 ; SSSE3-NEXT: popq %rbp @@ -3057,14 +3053,13 @@ ; SSE41-NEXT: pushq %rbp ; 
SSE41-NEXT: pushq %r15 ; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx ; SSE41-NEXT: movq %r9, %r10 -; SSE41-NEXT: movq %rcx, %r12 -; SSE41-NEXT: movq %rdx, %r11 +; SSE41-NEXT: movq %rcx, %r11 +; SSE41-NEXT: movq %rdx, %rcx ; SSE41-NEXT: movq %rsi, %rax -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; SSE41-NEXT: testq %r10, %r10 @@ -3074,53 +3069,52 @@ ; SSE41-NEXT: andb %dl, %bpl ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %rsi -; SSE41-NEXT: seto %bl +; SSE41-NEXT: seto %r15b ; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %rdi -; SSE41-NEXT: seto %cl -; SSE41-NEXT: orb %bl, %cl -; SSE41-NEXT: leaq (%rsi,%rax), %rbx +; SSE41-NEXT: seto %r12b +; SSE41-NEXT: orb %r15b, %r12b +; SSE41-NEXT: leaq (%rsi,%rax), %r10 ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %rdi ; SSE41-NEXT: movq %rdx, %rsi -; SSE41-NEXT: addq %rbx, %rsi -; SSE41-NEXT: setb %r13b -; SSE41-NEXT: orb %cl, %r13b -; SSE41-NEXT: orb %bpl, %r13b +; SSE41-NEXT: addq %r10, %rsi +; SSE41-NEXT: setb %r10b +; SSE41-NEXT: orb %r12b, %r10b +; SSE41-NEXT: orb %bpl, %r10b ; SSE41-NEXT: testq %r9, %r9 ; SSE41-NEXT: setne %al -; SSE41-NEXT: testq %r12, %r12 -; SSE41-NEXT: setne %r10b -; SSE41-NEXT: andb %al, %r10b -; SSE41-NEXT: movq %r12, %rax +; SSE41-NEXT: testq %r11, %r11 +; SSE41-NEXT: setne %bpl +; SSE41-NEXT: andb %al, %bpl +; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %r14 -; SSE41-NEXT: movq %rax, %rbp -; SSE41-NEXT: seto %r8b +; SSE41-NEXT: movq %rax, %r8 +; SSE41-NEXT: seto %r11b ; SSE41-NEXT: movq %r9, %rax -; SSE41-NEXT: mulq %r11 -; SSE41-NEXT: seto %cl -; SSE41-NEXT: orb %r8b, %cl -; SSE41-NEXT: addq %rax, %rbp -; SSE41-NEXT: movq %r11, %rax +; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: seto %r9b +; SSE41-NEXT: orb %r11b, %r9b +; SSE41-NEXT: addq %rax, %r8 +; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: mulq %r14 -; SSE41-NEXT: addq %rbp, %rdx -; SSE41-NEXT: setb %bl -; SSE41-NEXT: orb %cl, %bl -; SSE41-NEXT: orb %r10b, %bl -; SSE41-NEXT: movzbl %bl, %ecx +; SSE41-NEXT: addq %r8, %rdx +; SSE41-NEXT: setb %cl +; SSE41-NEXT: orb %r9b, %cl +; SSE41-NEXT: orb %bpl, %cl +; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: negl %ecx -; SSE41-NEXT: movzbl %r13b, %ebp -; SSE41-NEXT: negl %ebp -; SSE41-NEXT: movd %ebp, %xmm0 +; SSE41-NEXT: movzbl %r10b, %r8d +; SSE41-NEXT: negl %r8d +; SSE41-NEXT: movd %r8d, %xmm0 ; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, 16(%r15) -; SSE41-NEXT: movq %rdi, (%r15) -; SSE41-NEXT: movq %rdx, 24(%r15) -; SSE41-NEXT: movq %rsi, 8(%r15) +; SSE41-NEXT: movq %rax, 16(%rbx) +; SSE41-NEXT: movq %rdi, (%rbx) +; SSE41-NEXT: movq %rdx, 24(%rbx) +; SSE41-NEXT: movq %rsi, 8(%rbx) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r13 ; SSE41-NEXT: popq %r14 ; SSE41-NEXT: popq %r15 ; SSE41-NEXT: popq %rbp @@ -3131,14 +3125,13 @@ ; AVX-NEXT: pushq %rbp ; AVX-NEXT: pushq %r15 ; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %r13 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx ; AVX-NEXT: movq %r9, %r10 -; AVX-NEXT: movq %rcx, %r12 -; AVX-NEXT: movq %rdx, %r11 +; AVX-NEXT: movq %rcx, %r11 +; AVX-NEXT: movq %rdx, %rcx ; AVX-NEXT: movq %rsi, %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; AVX-NEXT: testq %r10, %r10 @@ -3148,53 +3141,52 @@ ; AVX-NEXT: andb %dl, %bpl ; 
AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %rsi -; AVX-NEXT: seto %bl +; AVX-NEXT: seto %r15b ; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %rdi -; AVX-NEXT: seto %cl -; AVX-NEXT: orb %bl, %cl -; AVX-NEXT: leaq (%rsi,%rax), %rbx +; AVX-NEXT: seto %r12b +; AVX-NEXT: orb %r15b, %r12b +; AVX-NEXT: leaq (%rsi,%rax), %r10 ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %rdi ; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: addq %rbx, %rsi -; AVX-NEXT: setb %r13b -; AVX-NEXT: orb %cl, %r13b -; AVX-NEXT: orb %bpl, %r13b +; AVX-NEXT: addq %r10, %rsi +; AVX-NEXT: setb %r10b +; AVX-NEXT: orb %r12b, %r10b +; AVX-NEXT: orb %bpl, %r10b ; AVX-NEXT: testq %r9, %r9 ; AVX-NEXT: setne %al -; AVX-NEXT: testq %r12, %r12 -; AVX-NEXT: setne %r10b -; AVX-NEXT: andb %al, %r10b -; AVX-NEXT: movq %r12, %rax +; AVX-NEXT: testq %r11, %r11 +; AVX-NEXT: setne %bpl +; AVX-NEXT: andb %al, %bpl +; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %r14 -; AVX-NEXT: movq %rax, %rbp -; AVX-NEXT: seto %r8b +; AVX-NEXT: movq %rax, %r8 +; AVX-NEXT: seto %r11b ; AVX-NEXT: movq %r9, %rax -; AVX-NEXT: mulq %r11 -; AVX-NEXT: seto %cl -; AVX-NEXT: orb %r8b, %cl -; AVX-NEXT: addq %rax, %rbp -; AVX-NEXT: movq %r11, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: seto %r9b +; AVX-NEXT: orb %r11b, %r9b +; AVX-NEXT: addq %rax, %r8 +; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %r14 -; AVX-NEXT: addq %rbp, %rdx -; AVX-NEXT: setb %bl -; AVX-NEXT: orb %cl, %bl -; AVX-NEXT: orb %r10b, %bl -; AVX-NEXT: movzbl %bl, %ecx +; AVX-NEXT: addq %r8, %rdx +; AVX-NEXT: setb %cl +; AVX-NEXT: orb %r9b, %cl +; AVX-NEXT: orb %bpl, %cl +; AVX-NEXT: movzbl %cl, %ecx ; AVX-NEXT: negl %ecx -; AVX-NEXT: movzbl %r13b, %ebp -; AVX-NEXT: negl %ebp -; AVX-NEXT: vmovd %ebp, %xmm0 +; AVX-NEXT: movzbl %r10b, %r8d +; AVX-NEXT: negl %r8d +; AVX-NEXT: vmovd %r8d, %xmm0 ; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq %rax, 16(%r15) -; AVX-NEXT: movq %rdi, (%r15) -; AVX-NEXT: movq %rdx, 24(%r15) -; AVX-NEXT: movq %rsi, 8(%r15) +; AVX-NEXT: movq %rax, 16(%rbx) +; AVX-NEXT: movq %rdi, (%rbx) +; AVX-NEXT: movq %rdx, 24(%rbx) +; AVX-NEXT: movq %rsi, 8(%rbx) ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 -; AVX-NEXT: popq %r13 ; AVX-NEXT: popq %r14 ; AVX-NEXT: popq %r15 ; AVX-NEXT: popq %rbp @@ -3208,63 +3200,63 @@ ; AVX512F-NEXT: pushq %r12 ; AVX512F-NEXT: pushq %rbx ; AVX512F-NEXT: movq %rcx, %rax -; AVX512F-NEXT: movq %rdx, %r12 -; AVX512F-NEXT: movq %rdi, %r11 +; AVX512F-NEXT: movq %rdx, %rcx +; AVX512F-NEXT: movq %rsi, %r10 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: testq %r10, %r10 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512F-NEXT: testq %rsi, %rsi ; AVX512F-NEXT: setne %dl -; AVX512F-NEXT: testq %rcx, %rcx -; AVX512F-NEXT: setne %bl -; AVX512F-NEXT: andb %dl, %bl -; AVX512F-NEXT: mulq %r15 -; AVX512F-NEXT: movq %rax, %rdi -; AVX512F-NEXT: seto %bpl -; AVX512F-NEXT: movq %r10, %rax -; AVX512F-NEXT: mulq %r12 -; AVX512F-NEXT: seto %cl -; AVX512F-NEXT: orb %bpl, %cl -; AVX512F-NEXT: leaq (%rdi,%rax), %rbp -; AVX512F-NEXT: movq %r12, %rax -; AVX512F-NEXT: mulq %r15 -; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: movq %rdx, %rdi -; AVX512F-NEXT: addq %rbp, %rdi +; AVX512F-NEXT: testq %rax, %rax +; AVX512F-NEXT: setne %bpl +; AVX512F-NEXT: andb %dl, %bpl +; AVX512F-NEXT: mulq %r14 +; AVX512F-NEXT: movq %rax, %r11 +; AVX512F-NEXT: seto %r15b +; AVX512F-NEXT: movq %rsi, %rax +; AVX512F-NEXT: mulq %rcx +; 
AVX512F-NEXT: seto %r12b +; AVX512F-NEXT: orb %r15b, %r12b +; AVX512F-NEXT: addq %rax, %r11 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: mulq %r14 +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: movq %rdx, %rcx +; AVX512F-NEXT: addq %r11, %rcx ; AVX512F-NEXT: setb %al -; AVX512F-NEXT: orb %cl, %al -; AVX512F-NEXT: orb %bl, %al +; AVX512F-NEXT: orb %r12b, %al +; AVX512F-NEXT: orb %bpl, %al ; AVX512F-NEXT: kmovw %eax, %k0 ; AVX512F-NEXT: testq %r9, %r9 ; AVX512F-NEXT: setne %al -; AVX512F-NEXT: testq %rsi, %rsi -; AVX512F-NEXT: setne %cl -; AVX512F-NEXT: andb %al, %cl -; AVX512F-NEXT: movq %rsi, %rax +; AVX512F-NEXT: testq %r10, %r10 +; AVX512F-NEXT: setne %r11b +; AVX512F-NEXT: andb %al, %r11b +; AVX512F-NEXT: movq %r10, %rax ; AVX512F-NEXT: mulq %r8 -; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: movq %rax, %r10 ; AVX512F-NEXT: seto %bpl ; AVX512F-NEXT: movq %r9, %rax -; AVX512F-NEXT: mulq %r11 -; AVX512F-NEXT: seto %bl -; AVX512F-NEXT: orb %bpl, %bl -; AVX512F-NEXT: addq %rax, %rsi -; AVX512F-NEXT: movq %r11, %rax +; AVX512F-NEXT: mulq %rdi +; AVX512F-NEXT: seto %r9b +; AVX512F-NEXT: orb %bpl, %r9b +; AVX512F-NEXT: addq %rax, %r10 +; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: mulq %r8 -; AVX512F-NEXT: addq %rsi, %rdx -; AVX512F-NEXT: setb %sil -; AVX512F-NEXT: orb %bl, %sil -; AVX512F-NEXT: orb %cl, %sil -; AVX512F-NEXT: andl $1, %esi -; AVX512F-NEXT: kmovw %esi, %k1 +; AVX512F-NEXT: addq %r10, %rdx +; AVX512F-NEXT: setb %dil +; AVX512F-NEXT: orb %r9b, %dil +; AVX512F-NEXT: orb %r11b, %dil +; AVX512F-NEXT: andl $1, %edi +; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: kshiftlw $1, %k0, %k0 ; AVX512F-NEXT: korw %k0, %k1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512F-NEXT: movq %r10, 16(%r14) -; AVX512F-NEXT: movq %rax, (%r14) -; AVX512F-NEXT: movq %rdi, 24(%r14) -; AVX512F-NEXT: movq %rdx, 8(%r14) +; AVX512F-NEXT: movq %rsi, 16(%rbx) +; AVX512F-NEXT: movq %rax, (%rbx) +; AVX512F-NEXT: movq %rcx, 24(%rbx) +; AVX512F-NEXT: movq %rdx, 8(%rbx) ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 ; AVX512F-NEXT: popq %r14 @@ -3280,63 +3272,63 @@ ; AVX512BW-NEXT: pushq %r12 ; AVX512BW-NEXT: pushq %rbx ; AVX512BW-NEXT: movq %rcx, %rax -; AVX512BW-NEXT: movq %rdx, %r12 -; AVX512BW-NEXT: movq %rdi, %r11 +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: movq %rsi, %r10 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: testq %r10, %r10 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512BW-NEXT: testq %rsi, %rsi ; AVX512BW-NEXT: setne %dl -; AVX512BW-NEXT: testq %rcx, %rcx -; AVX512BW-NEXT: setne %bl -; AVX512BW-NEXT: andb %dl, %bl -; AVX512BW-NEXT: mulq %r15 -; AVX512BW-NEXT: movq %rax, %rdi -; AVX512BW-NEXT: seto %bpl -; AVX512BW-NEXT: movq %r10, %rax -; AVX512BW-NEXT: mulq %r12 -; AVX512BW-NEXT: seto %cl -; AVX512BW-NEXT: orb %bpl, %cl -; AVX512BW-NEXT: leaq (%rdi,%rax), %rbp -; AVX512BW-NEXT: movq %r12, %rax -; AVX512BW-NEXT: mulq %r15 -; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: movq %rdx, %rdi -; AVX512BW-NEXT: addq %rbp, %rdi +; AVX512BW-NEXT: testq %rax, %rax +; AVX512BW-NEXT: setne %bpl +; AVX512BW-NEXT: andb %dl, %bpl +; AVX512BW-NEXT: mulq %r14 +; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: seto %r15b +; AVX512BW-NEXT: movq %rsi, %rax +; AVX512BW-NEXT: mulq %rcx +; AVX512BW-NEXT: seto %r12b +; AVX512BW-NEXT: orb %r15b, %r12b +; AVX512BW-NEXT: addq 
%rax, %r11 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: mulq %r14 +; AVX512BW-NEXT: movq %rax, %rsi +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: addq %r11, %rcx ; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: orb %cl, %al -; AVX512BW-NEXT: orb %bl, %al +; AVX512BW-NEXT: orb %r12b, %al +; AVX512BW-NEXT: orb %bpl, %al ; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: testq %r9, %r9 ; AVX512BW-NEXT: setne %al -; AVX512BW-NEXT: testq %rsi, %rsi -; AVX512BW-NEXT: setne %cl -; AVX512BW-NEXT: andb %al, %cl -; AVX512BW-NEXT: movq %rsi, %rax +; AVX512BW-NEXT: testq %r10, %r10 +; AVX512BW-NEXT: setne %r11b +; AVX512BW-NEXT: andb %al, %r11b +; AVX512BW-NEXT: movq %r10, %rax ; AVX512BW-NEXT: mulq %r8 -; AVX512BW-NEXT: movq %rax, %rsi +; AVX512BW-NEXT: movq %rax, %r10 ; AVX512BW-NEXT: seto %bpl ; AVX512BW-NEXT: movq %r9, %rax -; AVX512BW-NEXT: mulq %r11 -; AVX512BW-NEXT: seto %bl -; AVX512BW-NEXT: orb %bpl, %bl -; AVX512BW-NEXT: addq %rax, %rsi -; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: mulq %rdi +; AVX512BW-NEXT: seto %r9b +; AVX512BW-NEXT: orb %bpl, %r9b +; AVX512BW-NEXT: addq %rax, %r10 +; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: mulq %r8 -; AVX512BW-NEXT: addq %rsi, %rdx -; AVX512BW-NEXT: setb %sil -; AVX512BW-NEXT: orb %bl, %sil -; AVX512BW-NEXT: orb %cl, %sil -; AVX512BW-NEXT: andl $1, %esi -; AVX512BW-NEXT: kmovw %esi, %k1 +; AVX512BW-NEXT: addq %r10, %rdx +; AVX512BW-NEXT: setb %dil +; AVX512BW-NEXT: orb %r9b, %dil +; AVX512BW-NEXT: orb %r11b, %dil +; AVX512BW-NEXT: andl $1, %edi +; AVX512BW-NEXT: kmovw %edi, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512BW-NEXT: movq %r10, 16(%r14) -; AVX512BW-NEXT: movq %rax, (%r14) -; AVX512BW-NEXT: movq %rdi, 24(%r14) -; AVX512BW-NEXT: movq %rdx, 8(%r14) +; AVX512BW-NEXT: movq %rsi, 16(%rbx) +; AVX512BW-NEXT: movq %rax, (%rbx) +; AVX512BW-NEXT: movq %rcx, 24(%rbx) +; AVX512BW-NEXT: movq %rdx, 8(%rbx) ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 ; AVX512BW-NEXT: popq %r14 diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -561,26 +561,26 @@ ; SSE41-NEXT: psubd %xmm4, %xmm8 ; SSE41-NEXT: pminud %xmm8, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psubd %xmm5, %xmm4 -; SSE41-NEXT: pminud %xmm4, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm9, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm9 +; SSE41-NEXT: psubd %xmm5, %xmm9 +; SSE41-NEXT: pminud %xmm9, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm5 ; SSE41-NEXT: psubd %xmm6, %xmm5 ; SSE41-NEXT: pminud %xmm5, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm6 ; SSE41-NEXT: psubd %xmm7, %xmm6 ; SSE41-NEXT: pminud %xmm6, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE41-NEXT: pxor %xmm9, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm3 ; SSE41-NEXT: movdqa %xmm6, 48(%rdi) ; SSE41-NEXT: movdqa %xmm5, 32(%rdi) -; SSE41-NEXT: movdqa %xmm4, 16(%rdi) +; SSE41-NEXT: movdqa %xmm9, 16(%rdi) ; SSE41-NEXT: movdqa %xmm8, (%rdi) ; SSE41-NEXT: retq ; @@ -1186,102 +1186,102 @@ define <2 x i32> @usubo_v2i128(<2 
x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind { ; SSE2-LABEL: usubo_v2i128: ; SSE2: # %bb.0: -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: xorl %r11d, %r11d +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: xorl %r10d, %r10d ; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: sbbl %eax, %eax +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: sbbl %r11d, %r11d ; SSE2-NEXT: subq %r8, %rdi ; SSE2-NEXT: sbbq %r9, %rsi -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: sbbl %r11d, %r11d -; SSE2-NEXT: movd %r11d, %xmm0 +; SSE2-NEXT: movd %r11d, %xmm1 +; SSE2-NEXT: sbbl %r10d, %r10d +; SSE2-NEXT: movd %r10d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %rdx, 16(%r10) -; SSE2-NEXT: movq %rdi, (%r10) -; SSE2-NEXT: movq %rcx, 24(%r10) -; SSE2-NEXT: movq %rsi, 8(%r10) +; SSE2-NEXT: movq %rdx, 16(%rax) +; SSE2-NEXT: movq %rdi, (%rax) +; SSE2-NEXT: movq %rcx, 24(%rax) +; SSE2-NEXT: movq %rsi, 8(%rax) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: usubo_v2i128: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSSE3-NEXT: xorl %r11d, %r11d +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: xorl %r10d, %r10d ; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: movl $0, %eax -; SSSE3-NEXT: sbbl %eax, %eax +; SSSE3-NEXT: movl $0, %r11d +; SSSE3-NEXT: sbbl %r11d, %r11d ; SSSE3-NEXT: subq %r8, %rdi ; SSSE3-NEXT: sbbq %r9, %rsi -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: sbbl %r11d, %r11d -; SSSE3-NEXT: movd %r11d, %xmm0 +; SSSE3-NEXT: movd %r11d, %xmm1 +; SSSE3-NEXT: sbbl %r10d, %r10d +; SSSE3-NEXT: movd %r10d, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %rdx, 16(%r10) -; SSSE3-NEXT: movq %rdi, (%r10) -; SSSE3-NEXT: movq %rcx, 24(%r10) -; SSSE3-NEXT: movq %rsi, 8(%r10) +; SSSE3-NEXT: movq %rdx, 16(%rax) +; SSSE3-NEXT: movq %rdi, (%rax) +; SSSE3-NEXT: movq %rcx, 24(%rax) +; SSSE3-NEXT: movq %rsi, 8(%rax) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: usubo_v2i128: ; SSE41: # %bb.0: -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE41-NEXT: xorl %r11d, %r11d +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE41-NEXT: xorl %r10d, %r10d ; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: movl $0, %eax -; SSE41-NEXT: sbbl %eax, %eax +; SSE41-NEXT: movl $0, %r11d +; SSE41-NEXT: sbbl %r11d, %r11d ; SSE41-NEXT: subq %r8, %rdi ; SSE41-NEXT: sbbq %r9, %rsi -; SSE41-NEXT: sbbl %r11d, %r11d -; SSE41-NEXT: movd %r11d, %xmm0 -; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: movq %rdx, 16(%r10) -; SSE41-NEXT: movq %rdi, (%r10) -; SSE41-NEXT: movq %rcx, 24(%r10) -; SSE41-NEXT: movq %rsi, 8(%r10) +; SSE41-NEXT: sbbl %r10d, %r10d +; SSE41-NEXT: movd %r10d, %xmm0 +; SSE41-NEXT: pinsrd $1, %r11d, %xmm0 +; SSE41-NEXT: movq %rdx, 16(%rax) +; SSE41-NEXT: movq %rdi, (%rax) +; SSE41-NEXT: movq %rcx, 24(%rax) +; SSE41-NEXT: movq %rsi, 8(%rax) ; SSE41-NEXT: retq ; ; AVX-LABEL: usubo_v2i128: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: xorl %r11d, %r11d +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: xorl %r10d, %r10d ; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movl $0, %eax -; AVX-NEXT: sbbl %eax, %eax +; AVX-NEXT: movl $0, %r11d +; AVX-NEXT: sbbl %r11d, %r11d ; AVX-NEXT: subq %r8, %rdi ; AVX-NEXT: sbbq %r9, %rsi -; AVX-NEXT: sbbl %r11d, %r11d -; AVX-NEXT: vmovd %r11d, %xmm0 -; AVX-NEXT: 
vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %rdx, 16(%r10) -; AVX-NEXT: movq %rdi, (%r10) -; AVX-NEXT: movq %rcx, 24(%r10) -; AVX-NEXT: movq %rsi, 8(%r10) +; AVX-NEXT: sbbl %r10d, %r10d +; AVX-NEXT: vmovd %r10d, %xmm0 +; AVX-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0 +; AVX-NEXT: movq %rdx, 16(%rax) +; AVX-NEXT: movq %rdi, (%rax) +; AVX-NEXT: movq %rcx, 24(%rax) +; AVX-NEXT: movq %rsi, 8(%rax) ; AVX-NEXT: retq ; ; AVX512-LABEL: usubo_v2i128: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: setb %al -; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: setb %r10b +; AVX512-NEXT: kmovd %r10d, %k0 ; AVX512-NEXT: subq %r8, %rdi ; AVX512-NEXT: sbbq %r9, %rsi -; AVX512-NEXT: setb %al -; AVX512-NEXT: andl $1, %eax -; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: setb %r8b +; AVX512-NEXT: andl $1, %r8d +; AVX512-NEXT: kmovw %r8d, %k1 ; AVX512-NEXT: kshiftlw $1, %k0, %k0 ; AVX512-NEXT: korw %k0, %k1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: movq %rdx, 16(%r10) -; AVX512-NEXT: movq %rdi, (%r10) -; AVX512-NEXT: movq %rcx, 24(%r10) -; AVX512-NEXT: movq %rsi, 8(%r10) +; AVX512-NEXT: movq %rdx, 16(%rax) +; AVX512-NEXT: movq %rdi, (%rax) +; AVX512-NEXT: movq %rcx, 24(%rax) +; AVX512-NEXT: movq %rsi, 8(%rax) ; AVX512-NEXT: retq %t = call {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -1569,8 +1569,8 @@ ; SSSE3-NEXT: movdqa %xmm0, %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSSE3-NEXT: pand %xmm8, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; SSSE3-NEXT: movdqa %xmm7, %xmm6 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 ; SSSE3-NEXT: psrlw $4, %xmm5 ; SSSE3-NEXT: pand %xmm8, %xmm5 @@ -1580,7 +1580,7 @@ ; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 ; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: movdqa %xmm7, %xmm6 ; SSSE3-NEXT: pshufb %xmm5, %xmm6 ; SSSE3-NEXT: psrlw $4, %xmm1 ; SSSE3-NEXT: pand %xmm8, %xmm1 @@ -1589,20 +1589,20 @@ ; SSSE3-NEXT: por %xmm6, %xmm5 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: pand %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm7 -; SSSE3-NEXT: pshufb %xmm1, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, %xmm9 +; SSSE3-NEXT: pshufb %xmm1, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm2 ; SSSE3-NEXT: pand %xmm8, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm6 ; SSSE3-NEXT: pshufb %xmm2, %xmm6 -; SSSE3-NEXT: por %xmm7, %xmm6 +; SSSE3-NEXT: por %xmm9, %xmm6 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm8, %xmm1 -; SSSE3-NEXT: pshufb %xmm1, %xmm9 +; SSSE3-NEXT: pshufb %xmm1, %xmm7 ; SSSE3-NEXT: psrlw $4, %xmm3 ; SSSE3-NEXT: pand %xmm8, %xmm3 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: por %xmm9, %xmm4 +; SSSE3-NEXT: por %xmm7, %xmm4 ; SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm6, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 @@ -1870,46 +1870,46 @@ ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: movdqa 
{{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; SSSE3-NEXT: pshufb %xmm8, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm9, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pshufb %xmm0, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pshufb %xmm0, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm9, %xmm1 +; SSSE3-NEXT: pand %xmm7, %xmm1 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; SSSE3-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: por %xmm9, %xmm0 ; SSSE3-NEXT: pshufb %xmm8, %xmm5 ; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm9, %xmm1 -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pshufb %xmm1, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pshufb %xmm1, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm5 -; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pand %xmm7, %xmm5 ; SSSE3-NEXT: movdqa %xmm4, %xmm1 ; SSSE3-NEXT: pshufb %xmm5, %xmm1 -; SSSE3-NEXT: por %xmm6, %xmm1 +; SSSE3-NEXT: por %xmm9, %xmm1 ; SSSE3-NEXT: pshufb %xmm8, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pand %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pshufb %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pshufb %xmm5, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm2 -; SSSE3-NEXT: pand %xmm9, %xmm2 +; SSSE3-NEXT: pand %xmm7, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 ; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: por %xmm9, %xmm5 ; SSSE3-NEXT: pshufb %xmm8, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm9, %xmm2 -; SSSE3-NEXT: pshufb %xmm2, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pshufb %xmm2, %xmm6 ; SSSE3-NEXT: psrlw $4, %xmm3 -; SSSE3-NEXT: pand %xmm9, %xmm3 +; SSSE3-NEXT: pand %xmm7, %xmm3 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: por %xmm7, %xmm4 +; SSSE3-NEXT: por %xmm6, %xmm4 ; SSSE3-NEXT: movdqa %xmm5, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: retq @@ -2106,12 +2106,12 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; SSE2-LABEL: test_bitreverse_v16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; 
SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; SSE2-NEXT: packuswb %xmm5, %xmm0 @@ -2129,73 +2129,73 @@ ; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: psllw $2, %xmm0 ; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: psrlw $1, %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm7, %xmm8 ; SSE2-NEXT: pand %xmm7, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE2-NEXT: por %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE2-NEXT: packuswb %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: packuswb %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: psrlw $4, %xmm8 +; SSE2-NEXT: pand %xmm5, %xmm8 ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: psllw $4, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: psrlw $2, %xmm8 +; SSE2-NEXT: pand %xmm6, %xmm8 ; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: psllw $2, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm4 -; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: psrlw $1, %xmm8 +; SSE2-NEXT: pand %xmm7, %xmm8 ; SSE2-NEXT: pand %xmm7, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE2-NEXT: por %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE2-NEXT: pshuflw {{.*#+}} 
xmm8 = xmm8[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE2-NEXT: packuswb %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: packuswb %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: psrlw $4, %xmm8 +; SSE2-NEXT: pand %xmm5, %xmm8 ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: psllw $4, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: psrlw $2, %xmm8 +; SSE2-NEXT: pand %xmm6, %xmm8 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: psllw $2, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm4 -; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: psrlw $1, %xmm8 +; SSE2-NEXT: pand %xmm7, %xmm8 ; SSE2-NEXT: pand %xmm7, %xmm2 ; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE2-NEXT: por %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] -; SSE2-NEXT: packuswb %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm8, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: psrlw $4, %xmm4 ; SSE2-NEXT: pand %xmm5, %xmm4 @@ -2222,46 +2222,46 @@ ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; SSSE3-NEXT: pshufb %xmm8, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm9, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pshufb %xmm0, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pshufb %xmm0, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm9, %xmm1 +; SSSE3-NEXT: pand %xmm7, %xmm1 ; 
SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; SSSE3-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: por %xmm9, %xmm0 ; SSSE3-NEXT: pshufb %xmm8, %xmm5 ; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm9, %xmm1 -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pshufb %xmm1, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pshufb %xmm1, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm5 -; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pand %xmm7, %xmm5 ; SSSE3-NEXT: movdqa %xmm4, %xmm1 ; SSSE3-NEXT: pshufb %xmm5, %xmm1 -; SSSE3-NEXT: por %xmm6, %xmm1 +; SSSE3-NEXT: por %xmm9, %xmm1 ; SSSE3-NEXT: pshufb %xmm8, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pand %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pshufb %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pshufb %xmm5, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm2 -; SSSE3-NEXT: pand %xmm9, %xmm2 +; SSSE3-NEXT: pand %xmm7, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 ; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: por %xmm9, %xmm5 ; SSSE3-NEXT: pshufb %xmm8, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm9, %xmm2 -; SSSE3-NEXT: pshufb %xmm2, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pshufb %xmm2, %xmm6 ; SSSE3-NEXT: psrlw $4, %xmm3 -; SSSE3-NEXT: pand %xmm9, %xmm3 +; SSSE3-NEXT: pand %xmm7, %xmm3 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: por %xmm7, %xmm4 +; SSSE3-NEXT: por %xmm6, %xmm4 ; SSSE3-NEXT: movdqa %xmm5, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: retq @@ -2458,13 +2458,13 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; SSE2-LABEL: test_bitreverse_v8i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] @@ -2483,79 +2483,79 @@ ; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: psllw $2, %xmm0 ; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: psrlw $1, %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm7, %xmm8 ; SSE2-NEXT: pand %xmm7, %xmm0 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE2-NEXT: por %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE2-NEXT: packuswb %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: packuswb %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: psrlw $4, %xmm8 +; SSE2-NEXT: pand %xmm5, %xmm8 ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: psllw $4, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: psrlw $2, %xmm8 +; SSE2-NEXT: pand %xmm6, %xmm8 ; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: psllw $2, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm4 -; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: psrlw $1, %xmm8 +; SSE2-NEXT: pand %xmm7, %xmm8 ; SSE2-NEXT: pand %xmm7, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE2-NEXT: por %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE2-NEXT: packuswb %xmm4, 
%xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: packuswb %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: psrlw $4, %xmm8 +; SSE2-NEXT: pand %xmm5, %xmm8 ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: psllw $4, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: psrlw $2, %xmm8 +; SSE2-NEXT: pand %xmm6, %xmm8 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: psllw $2, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm4 -; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: psrlw $1, %xmm8 +; SSE2-NEXT: pand %xmm7, %xmm8 ; SSE2-NEXT: pand %xmm7, %xmm2 ; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE2-NEXT: por %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] -; SSE2-NEXT: packuswb %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm8, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: psrlw $4, %xmm4 ; SSE2-NEXT: pand %xmm5, %xmm4 @@ -2582,46 +2582,46 @@ ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; SSSE3-NEXT: pshufb %xmm8, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm9, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pshufb %xmm0, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pshufb %xmm0, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand %xmm9, %xmm1 +; SSSE3-NEXT: pand %xmm7, %xmm1 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; SSSE3-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: por %xmm9, %xmm0 ; SSSE3-NEXT: pshufb %xmm8, %xmm5 ; 
SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm9, %xmm1 -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pshufb %xmm1, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pshufb %xmm1, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm5 -; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pand %xmm7, %xmm5 ; SSSE3-NEXT: movdqa %xmm4, %xmm1 ; SSSE3-NEXT: pshufb %xmm5, %xmm1 -; SSSE3-NEXT: por %xmm6, %xmm1 +; SSSE3-NEXT: por %xmm9, %xmm1 ; SSSE3-NEXT: pshufb %xmm8, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pand %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pshufb %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pshufb %xmm5, %xmm9 ; SSSE3-NEXT: psrlw $4, %xmm2 -; SSSE3-NEXT: pand %xmm9, %xmm2 +; SSSE3-NEXT: pand %xmm7, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 ; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: por %xmm9, %xmm5 ; SSSE3-NEXT: pshufb %xmm8, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm9, %xmm2 -; SSSE3-NEXT: pshufb %xmm2, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pshufb %xmm2, %xmm6 ; SSSE3-NEXT: psrlw $4, %xmm3 -; SSSE3-NEXT: pand %xmm9, %xmm3 +; SSSE3-NEXT: pand %xmm7, %xmm3 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: por %xmm7, %xmm4 +; SSSE3-NEXT: por %xmm6, %xmm4 ; SSSE3-NEXT: movdqa %xmm5, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll --- a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -1109,33 +1109,33 @@ ; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSE2-NEXT: pand %xmm11, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm10 -; SSE2-NEXT: packssdw %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: packssdw %xmm9, %xmm6 ; SSE2-NEXT: pxor %xmm8, %xmm5 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm7 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: packssdw %xmm7, %xmm4 ; SSE2-NEXT: packssdw %xmm6, %xmm4 -; SSE2-NEXT: packssdw %xmm10, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm3 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 
@@ -1210,13 +1210,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9 ; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8 ; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm8 +; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3 @@ -1313,13 +1313,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9 ; AVX1-NEXT: vpcmpgtd %xmm8, %xmm9, %xmm8 ; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm8 +; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpcmpgtd %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpacksswb %xmm8, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3 @@ -1802,13 +1802,13 @@ ; AVX1-NEXT: vmovapd 176(%rbp), %ymm13 ; AVX1-NEXT: vmovapd 208(%rbp), %ymm14 ; AVX1-NEXT: vmovapd 240(%rbp), %ymm15 -; AVX1-NEXT: vcmpltpd %ymm7, %ymm15, %ymm15 -; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm7 -; AVX1-NEXT: vpackssdw %xmm7, %xmm15, %xmm15 +; AVX1-NEXT: vcmpltpd %ymm7, %ymm15, %ymm7 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm15 +; AVX1-NEXT: vpackssdw %xmm15, %xmm7, %xmm7 ; AVX1-NEXT: vcmpltpd %ymm6, %ymm14, %ymm6 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm14 +; AVX1-NEXT: vpackssdw %xmm14, %xmm6, %xmm6 ; AVX1-NEXT: vpackssdw %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpackssdw %xmm15, %xmm6, %xmm6 ; AVX1-NEXT: vcmpltpd %ymm5, %ymm13, %ymm5 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 ; AVX1-NEXT: vpackssdw %xmm7, %xmm5, %xmm5 @@ -1948,33 +1948,33 @@ ; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSE2-NEXT: pand %xmm11, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm10 -; SSE2-NEXT: packssdw %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: packssdw %xmm9, %xmm6 ; SSE2-NEXT: pxor %xmm8, %xmm5 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 ; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm7 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; 
SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: packssdw %xmm7, %xmm4 ; SSE2-NEXT: packssdw %xmm6, %xmm4 -; SSE2-NEXT: packssdw %xmm10, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm3 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm5 @@ -2189,12 +2189,12 @@ ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 ; AVX1-NEXT: vpcmpgtq 256(%rbp), %xmm8, %xmm8 ; AVX1-NEXT: vpcmpgtq 240(%rbp), %xmm7, %xmm7 -; AVX1-NEXT: vpackssdw %xmm8, %xmm7, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-NEXT: vpcmpgtq 224(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vpackssdw %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-NEXT: vpcmpgtq 224(%rbp), %xmm8, %xmm8 ; AVX1-NEXT: vpcmpgtq 208(%rbp), %xmm6, %xmm6 -; AVX1-NEXT: vpackssdw %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpackssdw %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vpackssdw %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 ; AVX1-NEXT: vpcmpgtq 192(%rbp), %xmm7, %xmm7 ; AVX1-NEXT: vpcmpgtq 176(%rbp), %xmm5, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -237,26 +237,26 @@ ; ; SSE41-LABEL: var_funnnel_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pandn %xmm8, %xmm4 +; SSE41-NEXT: pandn %xmm3, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] ; SSE41-NEXT: psrld $1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm6 ; SSE41-NEXT: psrld %xmm5, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrld %xmm7, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: psrld %xmm7, %xmm8 +; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm6 ; SSE41-NEXT: psrld %xmm4, %xmm6 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm4, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] -; SSE41-NEXT: pand %xmm8, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7] +; SSE41-NEXT: pand %xmm3, %xmm2 ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -163,44 +163,44 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [31,31,31,31] -; AVX1-NEXT: vpxor %xmm3, %xmm8, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 
= [31,31,31,31] +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpsrld $1, %xmm7, %xmm7 ; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm4 -; AVX1-NEXT: vpsrld %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm8 +; AVX1-NEXT: vpsrld %xmm8, %xmm7, %xmm8 +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; AVX1-NEXT: vpsrld %xmm9, %xmm7, %xmm9 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero ; AVX1-NEXT: vpsrld %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm9[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpmulld %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm2, %xmm8, %xmm4 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpmulld %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7 ; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7 -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] ; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -479,65 +479,65 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm7, %xmm9, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7 ; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; AVX1-NEXT: vpslld $23, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm6, %xmm9, %xmm6 +; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6 ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX1-NEXT: vpmullw %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpslld $23, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm6, %xmm9, %xmm6 -; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpslld $23, %xmm9, %xmm9 +; AVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm9 +; AVX1-NEXT: vcvttps2dq %xmm9, %xmm9 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 -; AVX1-NEXT: vpackusdw %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm9, %xmm4 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 -; AVX1-NEXT: vpaddd %xmm5, %xmm9, %xmm5 -; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = 
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 -; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpslld $23, %xmm6, %xmm6 +; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6 +; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 -; AVX1-NEXT: vpaddd %xmm5, %xmm9, %xmm5 +; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5 +; AVX1-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpslld $23, %xmm6, %xmm6 +; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6 +; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm9, %xmm2 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_funnnel_v32i8: @@ -1258,7 +1258,7 @@ ; AVX1-NEXT: vmovd %edx, %xmm1 ; AVX1-NEXT: vmovd %ecx, %xmm2 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,0,0,0] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 @@ -1266,32 +1266,32 @@ ; AVX1-NEXT: .LBB8_1: # %loop ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm8, %xmm3 -; AVX1-NEXT: vpmovsxbd %xmm3, %xmm10 +; AVX1-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm3, %xmm9 +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu 
4096(%rdi,%rax,4), %xmm5 ; AVX1-NEXT: vmovdqu 4112(%rdi,%rax,4), %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,3,3] -; AVX1-NEXT: vpsllq %xmm1, %xmm7, %xmm0 +; AVX1-NEXT: vpsllq %xmm1, %xmm7, %xmm8 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm5, %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm11 = xmm3[1,3],xmm0[1,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] -; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpsllq %xmm1, %xmm5, %xmm9 +; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm9[1,3],xmm8[1,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,2,3,3] +; AVX1-NEXT: vpsllq %xmm1, %xmm9, %xmm10 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm6, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] -; AVX1-NEXT: vpsllq %xmm2, %xmm7, %xmm4 +; AVX1-NEXT: vpsllq %xmm1, %xmm6, %xmm11 +; AVX1-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,3],xmm10[1,3] +; AVX1-NEXT: vpsllq %xmm2, %xmm7, %xmm7 ; AVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,3],xmm4[1,3] -; AVX1-NEXT: vblendvps %xmm10, %xmm11, %xmm4, %xmm4 -; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsllq %xmm2, %xmm6, %xmm5 -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,3],xmm3[1,3] -; AVX1-NEXT: vblendvps %xmm9, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm7[1,3] +; AVX1-NEXT: vblendvps %xmm4, %xmm8, %xmm5, %xmm4 +; AVX1-NEXT: vpsllq %xmm2, %xmm9, %xmm5 +; AVX1-NEXT: vpsllq %xmm2, %xmm6, %xmm6 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm6[1,3],xmm5[1,3] +; AVX1-NEXT: vblendvps %xmm3, %xmm10, %xmm5, %xmm3 ; AVX1-NEXT: vmovups %xmm4, 4096(%rdi,%rax,4) -; AVX1-NEXT: vmovups %xmm0, 4112(%rdi,%rax,4) +; AVX1-NEXT: vmovups %xmm3, 4112(%rdi,%rax,4) ; AVX1-NEXT: addq $8, %rax ; AVX1-NEXT: jne .LBB8_1 ; AVX1-NEXT: # %bb.2: # %exit diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -398,10 +398,10 @@ ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm7 -; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm8 +; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 @@ -419,7 +419,7 @@ ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -238,25 +238,25 @@ ; ; SSE41-LABEL: var_funnnel_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} 
xmm5 = xmm4[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm6 ; SSE41-NEXT: psrld %xmm5, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrld %xmm7, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: psrld %xmm7, %xmm8 +; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm6 ; SSE41-NEXT: psrld %xmm4, %xmm6 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm4, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] -; SSE41-NEXT: pandn %xmm8, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7] +; SSE41-NEXT: pandn %xmm3, %xmm2 ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -170,37 +170,37 @@ ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6 ; AVX1-NEXT: vpsrld %xmm6, %xmm3, %xmm6 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX1-NEXT: vpsrld %xmm7, %xmm3, %xmm7 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero -; AVX1-NEXT: vpsrld %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vpsrld %xmm8, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5],xmm5[6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [31,31,31,31] -; AVX1-NEXT: vpxor %xmm4, %xmm9, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [31,31,31,31] +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 +; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpmulld %xmm4, %xmm8, %xmm4 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm7 -; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm8 +; AVX1-NEXT: vpsrld 
%xmm8, %xmm1, %xmm8 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero +; AVX1-NEXT: vpsrld %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; AVX1-NEXT: vpxor %xmm2, %xmm9, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 @@ -335,8 +335,8 @@ ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm6 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpxor %xmm3, %xmm8, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm6 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm7 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] @@ -355,17 +355,17 @@ ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm7 ; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm7 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm5 -; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 -; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm5 -; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm6 -; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm5 -; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm8 +; AVX1-NEXT: vpblendvb %xmm6, %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6 +; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm6 +; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm6 +; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 ; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5 @@ -512,66 +512,66 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm5 ; AVX1-NEXT: vpsllw $4, %xmm5, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm6 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX1-NEXT: vpxor %xmm7, %xmm9, %xmm3 -; AVX1-NEXT: vpsllw $5, %xmm3, %xmm3 -; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpsllw $2, %xmm6, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm4 -; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm6, 
%xmm4 -; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6 -; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm4, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm6, %xmm11, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8 +; AVX1-NEXT: vpsllw $5, %xmm8, %xmm8 +; AVX1-NEXT: vpblendvb %xmm8, %xmm6, %xmm5, %xmm6 +; AVX1-NEXT: vpsllw $2, %xmm6, %xmm9 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm9 +; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm9 +; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX1-NEXT: vpsrlw $4, %xmm8, %xmm9 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm10, %xmm9, %xmm9 ; AVX1-NEXT: vpsllw $5, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX1-NEXT: vpand %xmm6, %xmm12, %xmm6 +; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vpsrlw $2, %xmm8, %xmm9 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vpand %xmm11, %xmm9, %xmm9 ; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vpsrlw $1, %xmm8, %xmm9 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm12, %xmm9, %xmm9 ; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm7 +; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm4 -; AVX1-NEXT: vpxor %xmm2, %xmm9, %xmm6 -; AVX1-NEXT: vpsllw $5, %xmm6, %xmm6 -; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm4 -; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4 -; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm11, %xmm4 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm7 +; AVX1-NEXT: vpand %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4 +; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm3 +; 
AVX1-NEXT: vpand %xmm3, %xmm10, %xmm3 ; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm12, %xmm4 +; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm11, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm12, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_funnnel_v32i8: @@ -1599,43 +1599,43 @@ ; AVX1-LABEL: constant_funnnel_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [256,2,4,8,16,32,64,128] -; AVX1-NEXT: vpmullw %xmm4, %xmm9, %xmm4 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [256,2,4,8,16,32,64,128] +; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [256,128,64,32,16,8,4,2] -; AVX1-NEXT: vpmullw %xmm2, %xmm10, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [256,128,64,32,16,8,4,2] +; AVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,1,2,4,8,16,32,64] -; AVX1-NEXT: vpmullw %xmm3, %xmm7, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [128,1,2,4,8,16,32,64] +; AVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm7 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [128,64,32,16,8,4,2,1] -; AVX1-NEXT: vpmullw %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [128,64,32,16,8,4,2,1] +; AVX1-NEXT: vpmullw %xmm4, %xmm10, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm9, %xmm4 ; AVX1-NEXT: vpackuswb %xmm7, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; AVX1-NEXT: vpmullw %xmm4, %xmm9, %xmm4 -; 
AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm1, %xmm10, %xmm1 +; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm0, %xmm10, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -134,8 +134,8 @@ ; AVX1-LABEL: var_funnnel_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpsubd %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31] ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 @@ -144,15 +144,15 @@ ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpmuludq %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3],xmm2[4,5],xmm6[6,7] +; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1 @@ -408,23 +408,23 @@ ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpsubb %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpsubb 
%xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX1-NEXT: vpandn %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpsllw $2, %xmm2, %xmm6 -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpsllw $2, %xmm2, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 +; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm6 -; AVX1-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm9 +; AVX1-NEXT: vpor %xmm3, %xmm9, %xmm3 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 @@ -432,7 +432,7 @@ ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsubb %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpsubb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3 @@ -443,7 +443,7 @@ ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -555,18 +555,18 @@ ; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX1-NEXT: vpxor %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpsubb %xmm8, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw $3, %xmm3, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5 ; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] @@ -578,14 +578,14 @@ ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm9, %xmm2 -; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm8, %xmm2 ; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsubb %xmm8, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3 ; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -578,8 +578,8 @@ ; AVX1-LABEL: test_rem7_32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37] ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 @@ -596,26 +596,26 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw $3, %xmm3, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] -; AVX1-NEXT: vpand %xmm2, %xmm7, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 
-; AVX1-NEXT: vpsllw $3, %xmm3, %xmm4 -; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll --- a/llvm/test/CodeGen/X86/vector-interleave.ll +++ b/llvm/test/CodeGen/X86/vector-interleave.ll @@ -14,17 +14,17 @@ ; SSE-NEXT: movdqa %xmm0, %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] @@ -32,15 +32,15 @@ ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: 
punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] @@ -51,8 +51,8 @@ ; SSE-NEXT: movdqa %xmm4, 96(%rdi) ; SSE-NEXT: movdqa %xmm0, 80(%rdi) ; SSE-NEXT: movdqa %xmm7, 64(%rdi) -; SSE-NEXT: movdqa %xmm2, 48(%rdi) -; SSE-NEXT: movdqa %xmm1, 32(%rdi) +; SSE-NEXT: movdqa %xmm1, 48(%rdi) +; SSE-NEXT: movdqa %xmm2, 32(%rdi) ; SSE-NEXT: movdqa %xmm8, 16(%rdi) ; SSE-NEXT: movdqa %xmm5, (%rdi) ; SSE-NEXT: retq @@ -63,9 +63,9 @@ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] @@ -78,15 +78,15 @@ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-NEXT: retq ; ; AVX2-LABEL: interleave8x8: @@ -95,9 +95,9 @@ ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] @@ -110,15 +110,15 @@ ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX2-NEXT: retq %ab = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> %cd = shufflevector <8 x i16> %c, <8 x i16> %d, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -263,57 +263,57 @@ ; SSE-LABEL: vf32: ; SSE: # %bb.0: ; SSE-NEXT: movdqa 64(%rdi), %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm10 +; SSE-NEXT: movdqa 80(%rdi), %xmm4 ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa 112(%rdi), %xmm7 ; SSE-NEXT: movdqa (%rdi), %xmm3 -; 
SSE-NEXT: movdqa 16(%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,2,2,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm4[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,2,2,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm4[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,2,2,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm6[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; SSE-NEXT: psrad $16, %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm8[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: psrad $16, %xmm9 ; SSE-NEXT: psrad $16, %xmm2 -; SSE-NEXT: packssdw %xmm5, %xmm2 +; SSE-NEXT: packssdw %xmm9, %xmm2 ; SSE-NEXT: psrad $16, %xmm7 ; SSE-NEXT: psrad 
$16, %xmm0 ; SSE-NEXT: packssdw %xmm7, %xmm0 -; SSE-NEXT: psrad $16, %xmm11 +; SSE-NEXT: psrad $16, %xmm6 ; SSE-NEXT: psrad $16, %xmm3 -; SSE-NEXT: packssdw %xmm11, %xmm3 -; SSE-NEXT: psrad $16, %xmm10 +; SSE-NEXT: packssdw %xmm6, %xmm3 +; SSE-NEXT: psrad $16, %xmm4 ; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm4, 32(%rsi) -; SSE-NEXT: movdqa %xmm12, (%rsi) -; SSE-NEXT: movdqa %xmm9, 48(%rsi) -; SSE-NEXT: movdqa %xmm8, 16(%rsi) +; SSE-NEXT: packssdw %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm12, 32(%rsi) +; SSE-NEXT: movdqa %xmm10, (%rsi) +; SSE-NEXT: movdqa %xmm8, 48(%rsi) +; SSE-NEXT: movdqa %xmm5, 16(%rsi) ; SSE-NEXT: movdqa %xmm1, 32(%rdx) ; SSE-NEXT: movdqa %xmm3, (%rdx) ; SSE-NEXT: movdqa %xmm0, 48(%rdx) @@ -323,46 +323,46 @@ ; AVX1-LABEL: vf32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa 112(%rdi), %xmm10 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm0[1],xmm10[2],xmm0[3],xmm10[4],xmm0[5],xmm10[6],xmm0[7] -; AVX1-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm0[1],xmm11[2],xmm0[3],xmm11[4],xmm0[5],xmm11[6],xmm0[7] -; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm8 -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm0[1],xmm12[2],xmm0[3],xmm12[4],xmm0[5],xmm12[6],xmm0[7] +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX1-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm6 ; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] -; AVX1-NEXT: vpackusdw %xmm5, %xmm7, %xmm9 +; AVX1-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vmovdqa (%rdi), %xmm7 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0],xmm0[1],xmm10[2],xmm0[3],xmm10[4],xmm0[5],xmm10[6],xmm0[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0],xmm0[1],xmm9[2],xmm0[3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] +; AVX1-NEXT: vpackusdw %xmm11, %xmm12, %xmm11 +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm0[1],xmm8[2],xmm0[3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm12, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm10, %xmm10 +; AVX1-NEXT: vpsrld $16, %xmm9, %xmm9 +; AVX1-NEXT: vpackusdw %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vpsrld $16, %xmm8, %xmm8 +; AVX1-NEXT: vpsrld $16, %xmm7, %xmm7 +; AVX1-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm5, %xmm4 -; AVX1-NEXT: vpackusdw 
%xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $16, %xmm7, %xmm4 -; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsrld $16, %xmm10, %xmm4 -; AVX1-NEXT: vpsrld $16, %xmm11, %xmm5 -; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrld $16, %xmm12, %xmm5 -; AVX1-NEXT: vpsrld $16, %xmm6, %xmm6 -; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm4, %xmm3 +; AVX1-NEXT: vpsrld $16, %xmm6, %xmm4 +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: vmovdqa %xmm3, 16(%rsi) -; AVX1-NEXT: vmovdqa %xmm9, 32(%rsi) -; AVX1-NEXT: vmovdqa %xmm8, 48(%rsi) -; AVX1-NEXT: vmovdqa %xmm5, 32(%rdx) -; AVX1-NEXT: vmovdqa %xmm4, 48(%rdx) -; AVX1-NEXT: vmovdqa %xmm2, (%rdx) -; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX1-NEXT: vmovdqa %xmm11, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm5, 32(%rsi) +; AVX1-NEXT: vmovdqa %xmm2, 48(%rsi) +; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX1-NEXT: vmovdqa %xmm7, (%rdx) +; AVX1-NEXT: vmovdqa %xmm9, 16(%rdx) ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: vf32: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -197,7 +197,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pand %xmm1, %xmm4 @@ -208,7 +208,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] @@ -225,12 +225,12 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 @@ -239,12 +239,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] ; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: por %xmm2, %xmm6 ; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movdqa %xmm0, (%rdx) +; SSE-NEXT: movdqa %xmm8, (%rdx) ; SSE-NEXT: movdqa %xmm6, (%rcx) ; SSE-NEXT: retq ; 
@@ -321,107 +321,107 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 80(%rdi), %xmm8 -; SSE-NEXT: movdqa 64(%rdi), %xmm11 -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm7 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm10 -; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,0] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,0] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa 
%xmm5, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,0] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: por %xmm8, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: por %xmm8, %xmm13 ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: por %xmm10, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, 
%xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm6 -; SSE-NEXT: por %xmm11, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm13, 16(%rsi) -; SSE-NEXT: movaps %xmm12, (%rsi) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm5, 16(%rcx) -; SSE-NEXT: movdqa %xmm4, (%rcx) +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movaps %xmm7, (%rsi) +; SSE-NEXT: movdqa %xmm13, 16(%rdx) +; SSE-NEXT: movdqa %xmm12, (%rdx) +; SSE-NEXT: movdqa %xmm9, 16(%rcx) +; SSE-NEXT: movdqa %xmm8, (%rcx) ; SSE-NEXT: retq ; ; AVX1-LABEL: vf16: @@ -436,21 +436,21 @@ ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2],xmm2[3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,1,2,1] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm7[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm8 +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5],xmm7[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm7[3,4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm7[5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm9[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3,4],xmm8[5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] @@ -460,9 +460,9 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX1-NEXT: vmovaps %ymm8, (%rsi) -; AVX1-NEXT: vmovdqa %xmm2, (%rdx) -; AVX1-NEXT: vmovdqa %xmm9, 16(%rdx) +; AVX1-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-NEXT: vmovdqa %xmm8, (%rdx) +; AVX1-NEXT: vmovdqa %xmm7, 16(%rdx) ; AVX1-NEXT: vmovdqa %xmm1, (%rcx) ; AVX1-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX1-NEXT: vzeroupper @@ -542,398 +542,404 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $72, %rsp -; SSE-NEXT: movdqa 96(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm7 -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa 160(%rdi), %xmm5 -; SSE-NEXT: movdqa 80(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa 32(%rdi), %xmm13 +; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm14 +; SSE-NEXT: movdqa 144(%rdi), %xmm12 +; SSE-NEXT: movdqa 160(%rdi), %xmm10 +; SSE-NEXT: movdqa 80(%rdi), %xmm13 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm8 -; SSE-NEXT: movdqa 64(%rdi), %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: 
pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0] ; SSE-NEXT: movdqa 112(%rdi), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa 128(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm13 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} 
xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: pandn %xmm12, %xmm9 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm7, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pandn %xmm8, %xmm14 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: pandn %xmm6, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; 
SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm12, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm14[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm10[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw 
{{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,7,6,7] +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,1,2,0] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,0] +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} 
xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm14, (%rdx) -; SSE-NEXT: movdqa %xmm9, 48(%rdx) -; SSE-NEXT: movdqa %xmm10, 16(%rdx) -; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm6, (%rcx) -; SSE-NEXT: movdqa %xmm4, 48(%rcx) -; SSE-NEXT: movdqa %xmm5, 16(%rcx) -; SSE-NEXT: addq $72, %rsp +; SSE-NEXT: movdqa %xmm1, 32(%rdx) +; SSE-NEXT: movdqa %xmm9, (%rdx) +; SSE-NEXT: movdqa %xmm12, 48(%rdx) +; SSE-NEXT: movdqa %xmm7, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: movdqa %xmm5, (%rcx) +; SSE-NEXT: movdqa %xmm3, 48(%rcx) +; SSE-NEXT: movdqa %xmm4, 16(%rcx) +; SSE-NEXT: addq $56, %rsp ; SSE-NEXT: retq ; ; AVX1-LABEL: vf32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX1-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,3,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX1-NEXT: vmovdqa 112(%rdi), %xmm10 -; AVX1-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm12 -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,3,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm12[3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,2,1] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm6[3,4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm12 -; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3,4],xmm12[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4],xmm0[5,6,7] +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,3,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5,6,7] +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX1-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7] -; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 
-; AVX1-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-NEXT: vmovdqa %xmm6, 48(%rdx) -; AVX1-NEXT: vmovdqa %xmm5, 32(%rdx) -; AVX1-NEXT: vmovdqa %xmm12, (%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm4, 16(%rdx) -; AVX1-NEXT: vmovdqa %xmm1, (%rcx) -; AVX1-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX1-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX1-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,2,1] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm0 +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX1-NEXT: vpshufb %xmm8, %xmm11, %xmm12 +; AVX1-NEXT: vmovdqa (%rdi), %xmm11 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm15 = xmm8[0,3,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm11[0],xmm13[1],xmm11[2,3],xmm13[4],xmm11[5,6],xmm13[7] +; AVX1-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,2,1] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm15[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm0 +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX1-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <2,3,8,9,14,15,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm8, %xmm15 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm12[3,4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm12 +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2,3,4],xmm12[5,6,7] +; AVX1-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] +; AVX1-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm12[2],xmm0[3,4],xmm12[5],xmm0[6,7] +; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-NEXT: vpshufb %xmm6, %xmm9, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = 
<4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm7[5,6,7] +; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] +; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3,4],xmm6[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0],xmm0[1],xmm12[2,3],xmm0[4],xmm12[5,6],xmm0[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] +; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5,6,7] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX1-NEXT: vmovdqa %xmm15, 32(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm0, (%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm0, 16(%rdx) +; AVX1-NEXT: vmovdqa %xmm6, (%rcx) +; AVX1-NEXT: vmovdqa %xmm4, 16(%rcx) +; AVX1-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX1-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX1-NEXT: popq %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5 -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13,14],ymm6[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6],ymm5[7],ymm2[8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13,14],ymm5[15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa 176(%rdi), %xmm5 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,4,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm15, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6],ymm9[7],ymm2[8],ymm9[9],ymm2[10,11],ymm9[12],ymm2[13,14],ymm9[15] -; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm9 +; AVX2-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7 +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15] +; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm7 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm7[2],xmm2[3,4],xmm7[5],xmm2[6,7] -; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7] +; AVX2-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-NEXT: vpblendvb %ymm13, %ymm4, %ymm3, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm10 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] ; AVX2-NEXT: vpshufb %ymm12, %ymm10, %ymm10 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX2-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX2-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpblendvb %ymm13, 
%ymm15, %ymm11, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7,8,9],ymm13[10],ymm0[11,12],ymm13[13],ymm0[14,15] -; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm2[2],xmm7[3,4],xmm2[5],xmm7[6,7] -; AVX2-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 +; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15] +; AVX2-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX2-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] @@ -943,20 +949,20 @@ ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX2-NEXT: vpblendvb %ymm1, %ymm15, %ymm11, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7],ymm5[8],ymm1[9,10],ymm5[11],ymm1[12,13],ymm5[14],ymm1[15] -; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3],xmm7[4],xmm2[5,6],xmm7[7] -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqa %ymm9, (%rsi) -; AVX2-NEXT: vmovdqa %ymm8, 32(%rsi) +; AVX2-NEXT: vmovdqa %ymm2, 32(%rsi) ; AVX2-NEXT: vmovdqa %ymm10, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vmovdqa %ymm11, (%rdx) ; AVX2-NEXT: vmovdqa %ymm3, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; 
AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -216,50 +216,50 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] 
+; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm0, (%rsi) -; SSE-NEXT: movapd %xmm2, (%rdx) -; SSE-NEXT: movapd %xmm6, (%rcx) -; SSE-NEXT: movapd %xmm3, (%r8) +; SSE-NEXT: movapd %xmm7, (%rdx) +; SSE-NEXT: movapd %xmm8, (%rcx) +; SSE-NEXT: movapd %xmm1, (%r8) ; SSE-NEXT: retq ; ; AVX1-LABEL: vf8: @@ -275,7 +275,7 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-NEXT: vpackusdw %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm8 +; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] @@ -295,9 +295,9 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] @@ -305,9 +305,9 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm8, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) ; AVX1-NEXT: vmovdqa %xmm5, (%rdx) -; AVX1-NEXT: vmovdqa %xmm0, (%rcx) +; AVX1-NEXT: vmovdqa %xmm6, (%rcx) ; AVX1-NEXT: vmovdqa %xmm1, (%r8) ; AVX1-NEXT: retq ; @@ -320,7 +320,7 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 @@ -344,9 
+344,9 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] @@ -354,9 +354,9 @@ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm8, (%rsi) +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%r8) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -370,7 +370,7 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm8 +; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 @@ -392,9 +392,9 @@ ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] @@ -402,9 +402,9 @@ ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm8, (%rsi) +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsi) ; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %xmm6, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %xmm1, (%r8) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -444,200 +444,198 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 96(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: 
movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa 112(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] +; SSE-NEXT: movdqa 112(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[0,1,1,3,4,5,6,7] ; SSE-NEXT: 
punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[2,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm12[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,2,0,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = 
xmm4[0],xmm6[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm10[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm15[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: movapd %xmm7, 16(%rsi) -; SSE-NEXT: movapd %xmm11, (%rsi) -; SSE-NEXT: movapd %xmm1, 16(%rdx) +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm14[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm13[0],xmm15[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: movapd %xmm4, 16(%rsi) +; SSE-NEXT: movapd %xmm7, (%rsi) +; SSE-NEXT: movapd %xmm8, 16(%rdx) ; SSE-NEXT: movapd %xmm5, (%rdx) -; SSE-NEXT: movapd %xmm6, 16(%rcx) -; SSE-NEXT: movapd %xmm3, (%rcx) -; SSE-NEXT: movapd %xmm2, 16(%r8) -; SSE-NEXT: movapd %xmm0, (%r8) +; SSE-NEXT: movapd %xmm15, 16(%rcx) +; SSE-NEXT: movapd %xmm6, (%rcx) +; SSE-NEXT: movapd %xmm3, 16(%r8) +; SSE-NEXT: movapd %xmm1, (%r8) ; SSE-NEXT: retq ; ; AVX1-LABEL: vf16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm2[1,2,3],xmm5[4],xmm2[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7] ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm2[1,2,3],xmm6[4],xmm2[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm1[1,2,3],xmm6[4],xmm1[5,6,7] +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm7 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm2[1,2,3],xmm7[4],xmm2[5,6,7] -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rdi), %xmm11 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX1-NEXT: 
vmovdqa 32(%rdi), %xmm13 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0],xmm2[1,2,3],xmm13[4],xmm2[5,6,7] -; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm2[1,2,3],xmm12[4],xmm2[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1,2,3],xmm11[4],xmm2[5,6,7] -; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm9 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm10 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,2,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,2,0,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm1[1,2,3],xmm7[4],xmm1[5,6,7] +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm1[1,2,3],xmm8[4],xmm1[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0],xmm1[1,2,3],xmm9[4],xmm1[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7] +; AVX1-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX1-NEXT: vpackusdw %xmm11, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm10, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = 
xmm6[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[0,1,2,0,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,1,2,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,1,2,0,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm14 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,2,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = 
xmm7[0,1,2,0,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm13[4,5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,1,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,3,1,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,1,3,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm9, (%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm1, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) ; AVX1-NEXT: vmovaps %ymm10, (%rdx) ; AVX1-NEXT: vmovaps %ymm11, (%rcx) -; AVX1-NEXT: vmovaps %ymm0, (%r8) +; AVX1-NEXT: vmovaps %ymm2, (%r8) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -663,94 +661,92 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} 
xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm13, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: vf16: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-FAST-ALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm3, %xmm4 @@ -760,68 +756,66 @@ ; AVX2-FAST-ALL-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm3, %ymm4 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm4, %ymm6 -; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm3, %ymm7 +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm3, %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm3, %ymm7 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> ; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm7, %ymm3 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm3 ; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm5, %xmm2 -; 
AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm4, %ymm3 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm7, %ymm4 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm1, %ymm3 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm3, %ymm7 -; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,1,2,3] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[3,1,2,3] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, (%rdx) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, (%rcx) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm11, %xmm10, %xmm12 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm12 = 
[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm12, %xmm6, %xmm13 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm12, %xmm3, %xmm12 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm5, %ymm2, %ymm5 +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm7, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm1, %ymm7 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, (%rdx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, (%rcx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: retq ; @@ -847,80 +841,78 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: 
vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} 
ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -961,13 +953,13 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $280, %rsp # imm = 0x118 -; SSE-NEXT: movdqa 224(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: subq $296, %rsp # imm = 0x128 +; SSE-NEXT: movdqa 224(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm3 @@ -976,32 +968,31 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; 
SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,1,0,2,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1009,63 +1000,63 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm15[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movdqa 160(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] +; 
SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] @@ -1075,10 +1066,11 @@ ; SSE-NEXT: pshuflw $237, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1095,293 +1087,293 @@ ; SSE-NEXT: # xmm3 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = 
mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,1,2,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,1,2,0,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm14[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm6[0],xmm10[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm14[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm5[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = 
xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movapd %xmm7, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rdx) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rdx) +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movapd %xmm12, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movapd %xmm15, 32(%rcx) -; SSE-NEXT: movapd %xmm10, (%rcx) +; SSE-NEXT: movapd %xmm9, (%rcx) ; SSE-NEXT: movapd %xmm2, 16(%rcx) -; SSE-NEXT: movapd %xmm11, 48(%rcx) -; SSE-NEXT: movapd %xmm5, 32(%r8) -; SSE-NEXT: movapd %xmm0, (%r8) +; SSE-NEXT: movapd %xmm10, 48(%rcx) +; SSE-NEXT: movapd %xmm4, 32(%r8) +; SSE-NEXT: movapd %xmm7, (%r8) ; SSE-NEXT: movapd %xmm1, 48(%r8) -; SSE-NEXT: movapd %xmm4, 16(%r8) -; SSE-NEXT: addq $280, %rsp # imm = 0x118 +; SSE-NEXT: movapd %xmm3, 16(%r8) +; SSE-NEXT: addq $296, %rsp # imm = 0x128 ; SSE-NEXT: retq ; ; AVX1-LABEL: vf32: ; AVX1: # %bb.0: ; AVX1-NEXT: subq $200, %rsp -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX1-NEXT: vmovdqa 240(%rdi), %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; AVX1-NEXT: vmovdqa %xmm1, %xmm12 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; AVX1-NEXT: vmovdqa %xmm1, %xmm9 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqa 224(%rdi), %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; AVX1-NEXT: vmovdqa %xmm2, %xmm14 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] +; AVX1-NEXT: vmovdqa %xmm2, %xmm10 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; AVX1-NEXT: vmovdqa %xmm2, %xmm15 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] +; AVX1-NEXT: vmovdqa %xmm2, %xmm11 ; AVX1-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] ; AVX1-NEXT: vmovdqa 160(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; AVX1-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; AVX1-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm6[1,2,3],xmm12[4],xmm6[5,6,7] +; AVX1-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm4[1,2,3],xmm8[4],xmm4[5,6,7] +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm6[1,2,3],xmm4[4],xmm6[5,6,7] ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm4[1,2,3],xmm5[4],xmm4[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6,7] ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm9 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm4[1,2,3],xmm9[4],xmm4[5,6,7] +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7] ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7] +; AVX1-NEXT: vpackusdw %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm4[1,2,3],xmm10[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm7, %xmm13 -; AVX1-NEXT: vmovdqa (%rdi), %xmm11 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm6[1,2,3],xmm15[4],xmm6[5,6,7] +; AVX1-NEXT: vpackusdw %xmm1, %xmm7, %xmm7 +; AVX1-NEXT: vmovdqa (%rdi), %xmm14 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] 
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm4[1,2,3],xmm11[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpackusdw %xmm13, %xmm4, %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0],xmm6[1,2,3],xmm14[4],xmm6[5,6,7] +; AVX1-NEXT: vpackusdw %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpackusdw %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,2,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm14, %xmm12 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-NEXT: vmovdqa %xmm15, %xmm14 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX1-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[0,2,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = 
xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm10, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-NEXT: vmovdqa %xmm11, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-NEXT: # xmm6 = mem[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX1-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-NEXT: # xmm6 = mem[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-NEXT: # xmm7 = mem[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-NEXT: # xmm7 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,1,2,3] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] ; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX1-NEXT: 
vpshuflw {{.*#+}} xmm4 = xmm5[0,1,2,0,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[3,1,2,3] ; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,1,2,3] ; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,0,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,1,2,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,2,0,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,2,0,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,1,2,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-NEXT: # xmm10 = mem[3,1,2,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,2,0,4,5,6,7] +; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-NEXT: # xmm8 = mem[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,2,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[3,1,2,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[3,1,2,3] -; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-NEXT: # xmm15 = mem[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7] -; 
AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-NEXT: # xmm12 = mem[3,1,2,3] +; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-NEXT: # xmm13 = mem[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-NEXT: # xmm3 = mem[3,1,2,3] -; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vpshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX1-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1393,8 +1385,8 @@ ; AVX1-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,3,1,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,3,1,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] @@ -1402,15 +1394,15 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,3,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm4[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,3,1,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,3,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,3,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] @@ -1430,7 +1422,7 @@ ; AVX1-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-NEXT: vmovaps %ymm13, 32(%rcx) +; AVX1-NEXT: vmovaps %ymm14, 32(%rcx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-NEXT: vmovaps %ymm1, 32(%r8) @@ -1441,7 +1433,7 @@ ; ; AVX2-SLOW-LABEL: vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $200, %rsp +; AVX2-SLOW-NEXT: subq $184, %rsp ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1484,33 +1476,34 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = 
xmm2[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] @@ -1518,20 +1511,18 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -1548,58 +1539,57 @@ ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] ; 
AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm1 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -1633,19 +1623,19 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -1671,19 +1661,19 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: addq $200, %rsp +; AVX2-SLOW-NEXT: addq $184, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: vf32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: subq $200, %rsp -; AVX2-FAST-ALL-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX2-FAST-ALL-NEXT: subq $136, %rsp +; AVX2-FAST-ALL-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 @@ -1694,155 +1684,149 @@ ; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm2, %xmm3 ; 
AVX2-FAST-ALL-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-ALL-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-ALL-NEXT: vpermd %ymm4, %ymm3, %ymm10 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm2, %ymm10, %ymm4 -; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm3, %ymm8 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-ALL-NEXT: vpshufb %ymm9, %ymm8, %ymm5 -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, %ymm11 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] +; AVX2-FAST-ALL-NEXT: vpermd %ymm4, %ymm2, %ymm7 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm4, %ymm7, %ymm3 +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm2, %ymm12 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm12, %ymm4 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-ALL-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-ALL-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-ALL-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-ALL-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm3, %ymm15 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm2, %ymm15, %ymm1 -; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm3, %ymm9 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm9, %ymm4 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm2, %ymm15 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm15, %ymm1 +; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm2, %ymm4 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm4, %ymm2 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm6, %xmm2, %xmm1 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm6, %xmm12, %xmm4 -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm3, %xmm1 +; AVX2-FAST-ALL-NEXT: vpshufb 
%xmm14, %xmm9, %xmm2 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm3, %xmm4 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm11, %xmm7 -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm10, %ymm5 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-ALL-NEXT: vpshufb %ymm13, %ymm8, %ymm7 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm5, %xmm6 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm11, %xmm10 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm7, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, %ymm10 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm12, %ymm7 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm6, %xmm4, %xmm7 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FAST-ALL-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 144(%rdi), %xmm10 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa 160(%rdi), %xmm13 +; AVX2-FAST-ALL-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm6, %xmm2 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm13, %xmm12 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX2-FAST-ALL-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX2-FAST-ALL-NEXT: vmovdqa 144(%rdi), %xmm14 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm12, %xmm1 ; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm15, %ymm1 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm13, %ymm9, %ymm8 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm15, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm4, %ymm2 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; 
AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpermd (%rsp), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm9 = ymm15[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,1,2,3] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm2, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[3,1,2,3] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,0,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-ALL-NEXT: vpshufb %ymm7, %ymm9, %ymm5 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm11[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-FAST-ALL-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-ALL-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] -; AVX2-FAST-ALL-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-FAST-ALL-NEXT: # xmm7 = mem[3,1,2,3] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,0,2,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm7, %ymm11 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm15 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm0, %ymm5 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm9, %ymm3 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; 
AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb %ymm10, %ymm7, %ymm3 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,1,2,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,3,1,4,5,6,7] ; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-ALL-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-ALL-NEXT: addq $200, %rsp +; AVX2-FAST-ALL-NEXT: 
vmovdqa %ymm2, (%r8) +; AVX2-FAST-ALL-NEXT: addq $136, %rsp ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $184, %rsp +; AVX2-FAST-PERLANE-NEXT: subq $168, %rsp ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1884,161 +1868,159 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, 
%xmm7, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = 
xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] -; 
AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: 
vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] @@ -2051,12 +2033,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 
32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $184, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $168, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -316,104 +316,102 @@ define void @vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm11 -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa 48(%rdi), %xmm12 +; SSE-NEXT: movdqa 64(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,3] ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm10, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,1,0,1] -; SSE-NEXT: movaps %xmm10, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,1,1,3] -; SSE-NEXT: psllq $48, %xmm11 -; SSE-NEXT: pandn %xmm11, %xmm10 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[0,2,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = 
xmm12[2,0],xmm8[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm15[2],xmm4[3],xmm15[3] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm15[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm3, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,0,1] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 ; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[3,0] +; SSE-NEXT: psrlq $48, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm7, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufhw $148, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,1,1,3] +; SSE-NEXT: psllq $48, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm3 ; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE-NEXT: movdqa %xmm9, (%rsi) -; SSE-NEXT: movdqa %xmm10, (%rdx) -; 
SSE-NEXT: movaps %xmm4, (%rcx) -; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movaps %xmm3, (%r9) +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pandn %xmm6, %xmm13 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[3,0] +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,0] +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movdqa %xmm4, (%rsi) +; SSE-NEXT: movdqa %xmm3, (%rdx) +; SSE-NEXT: movaps %xmm12, (%rcx) +; SSE-NEXT: movaps %xmm14, (%r8) +; SSE-NEXT: movaps %xmm2, (%r9) ; SSE-NEXT: retq ; ; AVX1-LABEL: vf8: @@ -432,7 +430,7 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,5,6],xmm6[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm6[7] ; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7] @@ -441,25 +439,25 @@ ; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,0,1,10,11,4,5,14,15,u,u] ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4,5,6,7] ; AVX1-NEXT: vpsllq $48, %xmm5, %xmm7 -; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,5,6],xmm7[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,1,3] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = 
xmm7[0,1,2,3,4,7,6,7] ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3,4,5],xmm7[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,1,2,0] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm7[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,u,u,u,u,u,u,12,13,14,15] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4,5],xmm7[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,1,2,0] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,u,u,u,u,u,u,12,13,14,15] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4,5],xmm8[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] @@ -470,10 +468,10 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,1,3] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-NEXT: vmovdqa %xmm8, (%rsi) -; AVX1-NEXT: vmovdqa %xmm9, (%rdx) -; AVX1-NEXT: vmovdqa %xmm4, (%rcx) -; AVX1-NEXT: vmovdqa %xmm6, (%r8) +; AVX1-NEXT: vmovdqa %xmm4, (%rsi) +; AVX1-NEXT: vmovdqa %xmm6, (%rdx) +; AVX1-NEXT: vmovdqa %xmm7, (%rcx) +; AVX1-NEXT: vmovdqa %xmm8, (%r8) ; AVX1-NEXT: vmovdqa %xmm0, (%r9) ; AVX1-NEXT: retq ; @@ -655,348 +653,354 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movdqa 144(%rdi), %xmm12 +; SSE-NEXT: pushq %rax +; SSE-NEXT: movdqa 144(%rdi), %xmm13 ; SSE-NEXT: movdqa 80(%rdi), %xmm5 -; SSE-NEXT: movdqa 96(%rdi), %xmm10 -; SSE-NEXT: movdqa 128(%rdi), %xmm8 -; SSE-NEXT: movdqa 112(%rdi), %xmm14 -; SSE-NEXT: movdqa 64(%rdi), %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movdqa 96(%rdi), %xmm3 +; SSE-NEXT: movdqa 128(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa 64(%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa 32(%rdi), %xmm11 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, %xmm14 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm13, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] +; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,1] +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: movaps %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: andps %xmm13, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,1] -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: andps %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,0,1] +; SSE-NEXT: movaps %xmm7, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] ; SSE-NEXT: 
punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: psllq $48, %xmm2 -; SSE-NEXT: movaps %xmm13, %xmm3 -; SSE-NEXT: andnps %xmm2, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: andnps %xmm2, %xmm13 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: orps %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3] +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; 
SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm2[2,0] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm14[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm14[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: andnps %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,0] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = 
xmm11[2,0],xmm4[3,0] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm11[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,3,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm11[3,0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[0,1,0,3] +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm9[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm10[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm14[3,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm7 -; 
SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm11 +; SSE-NEXT: por %xmm14, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[0,2] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm9[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[3,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,4,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm5[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movdqa %xmm13, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm6[2,0] +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = 
xmm8[0,1],xmm4[2,0] +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movdqa %xmm7, 16(%rdx) +; SSE-NEXT: movaps %xmm13, (%rdx) ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps %xmm15, (%rcx) -; SSE-NEXT: movaps %xmm4, 16(%r8) -; SSE-NEXT: movaps %xmm2, (%r8) -; SSE-NEXT: movaps %xmm1, 16(%r9) -; SSE-NEXT: movaps %xmm7, (%r9) -; SSE-NEXT: addq $24, %rsp +; SSE-NEXT: movaps %xmm11, 16(%r8) +; SSE-NEXT: movaps %xmm1, (%r8) +; SSE-NEXT: movaps %xmm3, 16(%r9) +; SSE-NEXT: movaps %xmm8, (%r9) +; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: vf16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,1,1,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; AVX1-NEXT: vmovdqa 112(%rdi), %xmm9 -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm13 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; AVX1-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm2[1],xmm1[1] +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7] +; AVX1-NEXT: vmovdqa 144(%rdi), %xmm8 ; AVX1-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm0[2,3],xmm7[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX1-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm15 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX1-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,0,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-NEXT: vandps %ymm1, %ymm12, %ymm3 -; AVX1-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-NEXT: vpermilps {{.*#+}} xmm11 = 
xmm1[0,1,0,1] -; AVX1-NEXT: vandnps %ymm11, %ymm12, %ymm11 -; AVX1-NEXT: vorps %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm10 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm0[4,5],xmm7[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] -; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm3 -; AVX1-NEXT: vandnps %ymm3, %ymm12, %ymm3 -; AVX1-NEXT: vpsrlq $48, %xmm15, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,1,10,11,4,5,14,15,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] -; AVX1-NEXT: vandps %ymm2, %ymm12, %ymm2 -; AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm11 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm8[4,5],xmm9[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,2,3,12,13,12,13,u,u,u,u] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,1,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,2,0] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm12 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,1,10,11,4,5,14,15,u,u,u,u,u,u] -; AVX1-NEXT: vpsrlq $48, %xmm13, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm7[4,5],xmm0[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm14[2,3],xmm15[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 
= xmm1[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3],xmm3[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1,2,3],xmm14[4,5],xmm15[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm5[4],xmm9[5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7] +; AVX1-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-NEXT: vandps %ymm11, %ymm9, %ymm12 +; AVX1-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX1-NEXT: vpermilps {{.*#+}} xmm13 = xmm9[0,1,0,1] +; AVX1-NEXT: vandnps %ymm13, %ymm11, %ymm13 +; AVX1-NEXT: vorps %ymm13, %ymm12, %ymm12 +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] +; AVX1-NEXT: vpsllq $48, %xmm9, %xmm13 +; AVX1-NEXT: vandnps %ymm13, %ymm11, %ymm13 +; AVX1-NEXT: vpsrlq $48, %xmm4, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,0,1,10,11,4,5,14,15,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] +; AVX1-NEXT: vandps %ymm11, %ymm14, %ymm11 +; 
AVX1-NEXT: vorps %ymm13, %ymm11, %ymm11 +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,8,9,2,3,12,13,12,13,u,u,u,u] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,1,1,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5],xmm13[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,2,0] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,u,u,u,u,u,u] +; AVX1-NEXT: vpsrlq $48, %xmm2, %xmm14 +; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm7[4,5],xmm8[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] +; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] +; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,1,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovaps %ymm10, (%rsi) ; AVX1-NEXT: vmovaps %ymm11, (%rdx) ; AVX1-NEXT: vmovaps %ymm12, (%rcx) -; AVX1-NEXT: vmovaps %ymm2, (%r8) +; AVX1-NEXT: vmovaps %ymm13, (%r8) ; AVX1-NEXT: vmovaps %ymm0, (%r9) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1015,57 +1019,57 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5,6],xmm5[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm6[2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] -; AVX2-SLOW-NEXT: vpshufb 
{{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm4[2],xmm6[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0],xmm6[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] @@ -1082,10 +1086,10 @@ ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rdx) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -1109,55 +1113,55 @@ ; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm6, %ymm6 
-; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm6, %xmm5 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6],xmm5[7] -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6],xmm9[7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,7,1,6> -; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm10, %ymm6 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vpermd %ymm9, %ymm10, %ymm9 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm6, %ymm9, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm8, %ymm9 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,2,u,u,5,7,2,4> -; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-ALL-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm7, %ymm7 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm7, %ymm7 -; 
AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3] -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,u,u,5,0,2,7> -; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm10, %ymm6 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-ALL-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm6, %ymm6 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm9, %ymm9 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = <0,2,u,u,5,7,2,4> +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] +; AVX2-FAST-ALL-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm10, %ymm10 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm7, %ymm10, %ymm7 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-FAST-ALL-NEXT: 
vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] +; AVX2-FAST-ALL-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm10, %ymm10 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm10, %ymm8 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] ; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] @@ -1171,10 +1175,10 @@ ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, (%rsi) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm12, (%rdx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm6, (%rdx) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, (%r8) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm8, (%r8) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: retq @@ -1193,57 +1197,57 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0],xmm4[1],xmm5[2,3] +; 
AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] @@ -1258,10 +1262,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -1318,17 +1322,18 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movdqa 304(%rdi), %xmm9 +; SSE-NEXT: subq $424, %rsp # imm = 0x1A8 +; SSE-NEXT: movdqa 304(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rdi), %xmm8 ; SSE-NEXT: movdqa 256(%rdi), %xmm12 -; SSE-NEXT: movdqa 288(%rdi), %xmm14 -; SSE-NEXT: movdqa 272(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: movdqa 96(%rdi), %xmm10 +; SSE-NEXT: movdqa 288(%rdi), %xmm6 +; SSE-NEXT: movdqa 272(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm5 +; SSE-NEXT: movdqa 96(%rdi), %xmm9 ; SSE-NEXT: movdqa 128(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm2 @@ -1339,28 +1344,29 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm7, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] -; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm10, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; SSE-NEXT: movaps %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,1,0,3] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] @@ -1368,12 +1374,12 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] -; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; SSE-NEXT: movaps %xmm10, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: andps %xmm7, %xmm3 +; SSE-NEXT: andps %xmm10, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1383,35 +1389,37 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movaps %xmm10, 
%xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm7, %xmm3 +; SSE-NEXT: andps %xmm10, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: movdqa 208(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: movdqa %xmm2, %xmm14 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 176(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-NEXT: movdqa 176(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: movdqa 160(%rdi), %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] @@ -1422,22 +1430,22 @@ ; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm10, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: andps %xmm7, %xmm2 +; SSE-NEXT: andps %xmm10, %xmm2 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,3,2,3] +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -1445,24 +1453,22 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: psllq $48, %xmm2 -; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm3 ; SSE-NEXT: andnps %xmm2, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: orps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,3,2,3] ; SSE-NEXT: pshuflw 
{{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,3,2,3] ; SSE-NEXT: pshufd $232, (%rsp), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -1472,22 +1478,23 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -1497,24 +1504,24 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] ; 
SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -1523,111 +1530,112 @@ ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0,1,3] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: andnps %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: 
punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,1,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0,1,3] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: andnps %xmm5, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,1,3] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: andnps %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; 
SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm7[3,0] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: andnps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm13[3,0] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: andnps %xmm13, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm14[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] @@ -1635,22 +1643,22 @@ ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[3,0] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm15, %xmm11 +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: andnps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -1658,120 +1666,121 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7] -; 
SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm12[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[0,2] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm13[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm15, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[2,0],xmm14[3,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm5[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm4[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,0,3] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm13 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por 
%xmm6, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm6[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,0] -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,2,2,3] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm6[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[2,0] -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3] +; SSE-NEXT: por %xmm13, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1796,26 +1805,26 @@ ; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps %xmm3, 32(%r8) -; SSE-NEXT: movaps %xmm15, (%r8) +; SSE-NEXT: movaps %xmm15, 32(%r8) +; SSE-NEXT: movaps %xmm5, (%r8) ; SSE-NEXT: movaps %xmm8, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movaps %xmm10, (%r9) +; SSE-NEXT: movaps %xmm9, (%r9) ; SSE-NEXT: movaps %xmm12, 32(%r9) ; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps %xmm14, 16(%r9) -; SSE-NEXT: addq $408, %rsp # imm = 0x198 +; SSE-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSE-NEXT: retq ; ; AVX1-LABEL: vf32: ; AVX1: # %bb.0: ; AVX1-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX1-NEXT: vmovdqa 304(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm2, %xmm13 +; AVX1-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa 256(%rdi), %xmm2 @@ -1847,263 +1856,265 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-NEXT: vandnps %ymm4, %ymm12, %ymm4 +; AVX1-NEXT: vandnps %ymm4, %ymm5, %ymm4 +; AVX1-NEXT: vmovaps %ymm5, %ymm9 ; AVX1-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa 128(%rdi), %xmm9 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3],xmm9[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX1-NEXT: vmovdqa 128(%rdi), %xmm15 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1],xmm12[2,3],xmm15[4,5,6,7] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX1-NEXT: vmovdqa %xmm0, %xmm14 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; AVX1-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,1,3] ; AVX1-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm11 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; AVX1-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] +; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0,1,2,3,4],xmm1[5,6,7] ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] -; AVX1-NEXT: vmovdqa %xmm2, %xmm10 +; AVX1-NEXT: vmovdqa %xmm2, %xmm7 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,1,2,3,4,5,6,7] ; AVX1-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] -; AVX1-NEXT: vmovdqa %xmm1, %xmm5 +; AVX1-NEXT: vmovdqa %xmm1, %xmm13 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,1,0,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3],xmm2[4],xmm15[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7] -; AVX1-NEXT: vandps %ymm4, %ymm12, %ymm7 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,1,0,3] +; AVX1-NEXT: vmovdqa %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm2[4],xmm11[5,6,7] +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm11[4,5,6,7] +; AVX1-NEXT: vmovaps %ymm9, %ymm0 +; AVX1-NEXT: vandps %ymm4, %ymm9, %ymm11 ; AVX1-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[0,1,0,1] -; AVX1-NEXT: vmovaps %xmm1, %xmm4 -; AVX1-NEXT: vandnps %ymm15, %ymm12, %ymm15 -; AVX1-NEXT: vorps %ymm7, %ymm15, %ymm7 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm9 = xmm1[0,1,0,1] +; AVX1-NEXT: vmovaps %xmm1, %xmm14 +; AVX1-NEXT: vandnps %ymm9, %ymm0, %ymm9 +; AVX1-NEXT: vorps %ymm9, %ymm11, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa %xmm14, %xmm15 -; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm14[2,3],xmm8[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm8[2,3],xmm5[4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3,4,5,6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm3[4,5],xmm9[6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm7 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] -; 
AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlq $48, %xmm10, %xmm14 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm6[2,3],xmm2[4,5],xmm6[6,7] -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] -; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3,4,5,6,7] -; AVX1-NEXT: vandps %ymm7, %ymm12, %ymm7 -; AVX1-NEXT: vpsllq $48, %xmm4, %xmm14 -; AVX1-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vandnps %ymm14, %ymm12, %ymm14 -; AVX1-NEXT: vorps %ymm7, %ymm14, %ymm7 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,2,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3,4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm12, %xmm11 +; AVX1-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm15, %xmm4 +; AVX1-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm12[4,5],xmm15[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX1-NEXT: vpshufb %xmm5, %xmm9, %xmm9 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm9[5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,2,2,3,4,5,6,7] +; AVX1-NEXT: vpsrlq $48, %xmm7, %xmm8 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] +; AVX1-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4,5,6,7] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-NEXT: vandps %ymm3, %ymm8, %ymm8 +; AVX1-NEXT: vpsllq $48, %xmm14, %xmm9 +; AVX1-NEXT: vmovdqa %xmm14, %xmm2 +; AVX1-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vandnps %ymm9, %ymm3, %ymm9 +; AVX1-NEXT: vorps %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa %xmm13, %xmm14 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm13[4,5],xmm5[6,7] -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm14[4,5],xmm7[6,7] +; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload +; AVX1-NEXT: # xmm8 = mem[0,1],xmm15[2,3],mem[4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; 
AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,2,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3,4],xmm0[5,6,7] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1],xmm12[2,3],xmm13[4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX1-NEXT: # xmm4 = xmm1[0,1],mem[2,3],xmm1[4,5],mem[6,7] -; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm1 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1],xmm13[2,3],xmm12[4,5],xmm13[6,7] +; AVX1-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm10, %xmm7 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5,6,7] -; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-NEXT: vandps %ymm7, %ymm1, %ymm1 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpsllq $48, %xmm4, %xmm4 -; AVX1-NEXT: vandnps %ymm4, %ymm7, %ymm4 -; AVX1-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vpsrlq $48, %xmm10, %xmm9 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4,5,6,7] +; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-NEXT: vpsllq $48, %xmm5, %xmm8 +; AVX1-NEXT: vandnps %ymm8, %ymm9, %ymm5 +; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[0,1,2,3],xmm15[4,5],mem[6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw $12, (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-NEXT: # xmm4 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm6[2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = mem[0,1,1,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; AVX1-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,2,0] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5],mem[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-NEXT: # xmm5 = mem[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm4[2,3],xmm11[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX1-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload +; AVX1-NEXT: # xmm5 = xmm4[0,1],mem[2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX1-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-NEXT: # xmm11 = mem[0,1,1,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] +; AVX1-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3,4,5],xmm11[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,1,2,0] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm11[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa %xmm5, %xmm15 -; AVX1-NEXT: vmovdqa %xmm14, %xmm8 -; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm5[2,3],xmm14[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm14 -; AVX1-NEXT: vmovdqa %xmm12, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm12[4,5],xmm13[6,7] -; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa %xmm11, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3,4],xmm14[5,6,7] -; AVX1-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm11[2,3],xmm7[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,1,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,2,0] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa %xmm7, %xmm11 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm7[2,3],xmm14[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],xmm15[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,1,1,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; AVX1-NEXT: vmovdqa %xmm10, %xmm9 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4,5],xmm5[6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,1,2,0] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm13[2,3],xmm5[4,5],xmm13[6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlq $48, %xmm3, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm15[4,5],xmm8[6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm10, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm11[4,5],xmm7[6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm0[2,3],xmm15[4,5],xmm0[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} 
xmm1 = +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm0 +; AVX1-NEXT: vpsrlq $48, %xmm3, %xmm5 +; AVX1-NEXT: vmovdqa %xmm3, %xmm15 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2,3],xmm11[4,5],xmm14[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX1-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm6[2,3],xmm9[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,4,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3,4,5],xmm5[6,7] +; AVX1-NEXT: vmovdqa %xmm10, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm11[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] -; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm12, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm2[4,5],xmm4[6,7] +; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm0[2,3],xmm8[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7] +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vpsrlq $48, %xmm11, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1,2,3],xmm13[4,5],xmm11[6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,1,0,3] -; 
AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm9 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm10[4,5],xmm12[6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,4,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3],xmm4[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3],xmm5[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2,3],xmm13[4,5],xmm14[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4,5],xmm4[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] +; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,1,3] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vpshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] +; AVX1-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = mem[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] ; AVX1-NEXT: 
vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX1-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-NEXT: # xmm4 = mem[0,3,2,3] ; AVX1-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-NEXT: # xmm4 = xmm4[0,1,2],mem[3],xmm4[4,5,6,7] -; AVX1-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[2,3,2,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload -; AVX1-NEXT: # xmm4 = xmm6[0,1,2,3],mem[4,5],xmm6[6,7] +; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = mem[0,1,2,3],xmm6[4,5],mem[6,7] ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-NEXT: # xmm4 = mem[1,1,1,1] @@ -2112,7 +2123,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,1,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,1,1,3] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 @@ -2120,17 +2131,17 @@ ; AVX1-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm3, 32(%rdx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-NEXT: vmovaps %ymm9, (%r8) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-NEXT: vmovaps %ymm0, (%r8) ; AVX1-NEXT: vmovaps %ymm2, 32(%r9) ; AVX1-NEXT: vmovaps %ymm1, (%r9) ; AVX1-NEXT: addq $424, %rsp # imm = 0x1A8 @@ -2140,408 +2151,414 @@ ; AVX2-SLOW-LABEL: vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $280, %rsp # imm = 0x118 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vmovdqa 
%ymm2, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5],ymm3[6],ymm10[7,8],ymm3[9],ymm10[10,11],ymm3[12],ymm10[13],ymm3[14],ymm10[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm9[1,2],ymm15[3],ymm9[4],ymm15[5],ymm9[6,7],ymm15[8],ymm9[9,10],ymm15[11],ymm9[12],ymm15[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm11 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm2, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10],ymm13[11],ymm14[12,13],ymm13[14],ymm14[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} 
xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5],ymm15[6],ymm11[7,8],ymm15[9],ymm11[10,11],ymm15[12],ymm11[13],ymm15[14],ymm11[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm14 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5,6,7],ymm9[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm7[2],xmm8[3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4,5],xmm10[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4],ymm8[5],ymm3[6,7],ymm8[8],ymm3[9,10],ymm8[11],ymm3[12],ymm8[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6],ymm13[7] +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} 
ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1,2,3],xmm11[4,5],xmm13[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm13 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm13, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10,11],ymm8[12],ymm3[13],ymm8[14],ymm3[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3],xmm0[4,5,6],xmm13[7] +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm13, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm3[0],xmm15[1],xmm3[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0,1,2,3,4],ymm11[5,6,7],ymm9[8,9,10,11,12],ymm11[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0],xmm13[1],xmm7[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm15[2],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm12 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = 
[0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1,2,3,4],ymm5[5,6,7],ymm12[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5,6,7],ymm14[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm13[2],xmm7[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm2[2],ymm12[3],ymm2[4],ymm12[5,6],ymm2[7],ymm12[8,9],ymm2[10],ymm12[11],ymm2[12],ymm12[13,14],ymm2[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm8[1],xmm7[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] -; AVX2-SLOW-NEXT: vpermq 
{{.*#+}} ymm11 = ymm5[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0,1],ymm13[2],ymm14[3],ymm13[4],ymm14[5,6],ymm13[7],ymm14[8,9],ymm13[10],ymm14[11],ymm13[12],ymm14[13,14],ymm13[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3,4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0,1],ymm8[2],ymm5[3],ymm8[4],ymm5[5,6],ymm8[7],ymm5[8,9],ymm8[10],ymm5[11],ymm8[12],ymm5[13,14],ymm8[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0],xmm3[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4],ymm2[5],ymm12[6,7],ymm2[8],ymm12[9,10],ymm2[11],ymm12[12],ymm2[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm8[2],xmm7[3] -; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm9[2],xmm10[3] -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-SLOW-NEXT: vinserti128 $1, 
%xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3],ymm15[4],ymm12[5,6],ymm15[7],ymm12[8,9],ymm15[10],ymm12[11],ymm15[12],ymm12[13,14],ymm15[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1],ymm6[2],ymm1[3],ymm6[4],ymm1[5,6],ymm6[7],ymm1[8,9],ymm6[10],ymm1[11],ymm6[12],ymm1[13,14],ymm6[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm7[1],xmm13[2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7,8],ymm2[9],ymm11[10],ymm2[11],ymm11[12,13],ymm2[14],ymm11[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm12[2],xmm15[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm1[1,2],ymm6[3],ymm1[4],ymm6[5],ymm1[6,7],ymm6[8],ymm1[9,10],ymm6[11],ymm1[12],ymm6[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, 
%ymm12, %xmm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2],xmm12[3] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm12[2],xmm13[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5,6,7],ymm9[8,9,10,11,12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm7[0,1],mem[2],ymm7[3],mem[4],ymm7[5,6],mem[7],ymm7[8,9],mem[10],ymm7[11],mem[12],ymm7[13,14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7] +; 
AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-SLOW-NEXT: addq $280, %rsp # imm = 0x118 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: vf32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: subq $200, %rsp -; AVX2-FAST-ALL-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-FAST-ALL-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FAST-ALL-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-FAST-ALL-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-ALL-NEXT: subq $264, %rsp # imm = 0x108 +; AVX2-FAST-ALL-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX2-FAST-ALL-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX2-FAST-ALL-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4],ymm5[5],ymm7[6,7],ymm5[8],ymm7[9,10],ymm5[11],ymm7[12],ymm5[13],ymm7[14,15] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3] ; AVX2-FAST-ALL-NEXT: vpermd %ymm8, %ymm10, %ymm8 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = 
[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] ; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm8, %ymm9 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5],ymm4[6],ymm0[7,8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13],ymm4[14],ymm0[15] -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm8, %xmm0 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3],xmm8[4,5],xmm0[6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm0, %ymm9, %ymm9 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm14[1],ymm3[2,3],ymm14[4],ymm3[5],ymm14[6],ymm3[7,8],ymm14[9],ymm3[10,11],ymm14[12],ymm3[13],ymm14[14],ymm3[15] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm2[1,2],ymm12[3],ymm2[4],ymm12[5],ymm2[6,7],ymm12[8],ymm2[9,10],ymm12[11],ymm2[12],ymm12[13],ymm2[14,15] -; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm10, %ymm1 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm10 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,u,u,u,4,7,1,6> -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15] ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm13, %xmm1 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3],xmm13[4,5,6],xmm1[7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm13 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm3[1],ymm14[2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10],ymm3[11],ymm14[12,13],ymm3[14],ymm14[15] -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3],xmm1[4,5,6],xmm15[7] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5],ymm12[6],ymm2[7,8],ymm12[9],ymm2[10,11],ymm12[12],ymm2[13],ymm12[14],ymm2[15] -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm12, %ymm4 -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,u,u,4,7,1,6> -; AVX2-FAST-ALL-NEXT: vpermd 
%ymm13, %ymm2, %ymm15 +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm12, %xmm8, %xmm13 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm13, %ymm9, %ymm9 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm12, %xmm13, %xmm12 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm10, %ymm10 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm12, %ymm10, %ymm10 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,4,7,1,6> +; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm12, %ymm11 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] +; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm15[1],ymm5[2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7,8],ymm15[9],ymm5[10],ymm15[11],ymm5[12,13],ymm15[14],ymm5[15] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm15, %ymm6 +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm13, %ymm11, %ymm0 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FAST-ALL-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm15, %ymm11 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,1,3,0,3,5,7] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm1, %ymm11, %ymm1 -; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm2, %ymm8 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm14, %ymm12, 
%ymm12 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm11, %ymm12, %ymm12 +; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm14, %ymm8 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm8, %ymm8 ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm2, %ymm2 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7],ymm10[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,2,3,1,3,6,7] -; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm2, %ymm9 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm9, %ymm9 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm14, %ymm8 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm8, %ymm8 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7],ymm10[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,2,3,1,3,6,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm9, %ymm10 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm1, %ymm10, %ymm10 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm2, %ymm0 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm9, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm8[2],ymm5[3],ymm8[4],ymm5[5,6],ymm8[7],ymm5[8,9],ymm8[10],ymm5[11],ymm8[12],ymm5[13,14],ymm8[15] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, %ymm6 ; AVX2-FAST-ALL-NEXT: vmovdqu 
%ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3,4],xmm0[5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5],ymm7[6],ymm3[7,8],ymm7[9],ymm3[10,11],ymm7[12],ymm3[13],ymm7[14],ymm3[15] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = <0,2,u,u,5,7,2,4> +; AVX2-FAST-ALL-NEXT: vpermd %ymm9, %ymm14, %ymm9 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm2, %ymm9, %ymm9 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,4,6,0,1,4,6,0] +; AVX2-FAST-ALL-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm9, %ymm10 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm10, %ymm10 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1],ymm0[2],ymm4[3],ymm0[4],ymm4[5,6],ymm0[7],ymm4[8,9],ymm0[10],ymm4[11],ymm0[12],ymm4[13,14],ymm0[15] +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3,4],xmm10[5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5],ymm12[6],ymm4[7,8],ymm12[9],ymm4[10,11],ymm12[12],ymm4[13],ymm12[14],ymm4[15] +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm14, %ymm10 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm9, %ymm2 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4],ymm8[5],ymm6[6,7],ymm8[8],ymm6[9,10],ymm8[11],ymm6[12],ymm8[13],ymm6[14,15] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm8, %ymm15 ; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = <0,2,u,u,5,7,2,4> -; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-ALL-NEXT: vpshufb 
%xmm0, %xmm1, %xmm1 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm10, %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,4,6,0,1,4,6,0] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0] ; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm2, %ymm10 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm10, %ymm10 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm2, %ymm9 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm9[5,6,7],ymm1[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm3[2],ymm14[3],ymm3[4],ymm14[5,6],ymm3[7],ymm14[8,9],ymm3[10],ymm14[11],ymm3[12],ymm14[13,14],ymm3[15] -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3,4],xmm10[5,6,7] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5],ymm12[6],ymm4[7,8],ymm12[9],ymm4[10,11],ymm12[12],ymm4[13],ymm12[14],ymm4[15] -; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm2, %ymm1 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-ALL-NEXT: 
vextracti128 $1, %ymm1, %xmm0 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7,8],ymm4[9],ymm12[10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, %ymm14 ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm10, %ymm1 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm1, %ymm9 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm5, %ymm9, %ymm9 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm14[1,2],ymm3[3],ymm14[4],ymm3[5],ymm14[6,7],ymm3[8],ymm14[9,10],ymm3[11],ymm14[12],ymm3[13],ymm14[14,15] -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, %ymm9 -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2],xmm0[3] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, %ymm15 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7,8],ymm4[9],ymm12[10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7],ymm7[8,9],ymm6[10],ymm7[11],ymm6[12],ymm7[13,14],ymm6[15] -; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm3[2],ymm7[3],ymm3[4],ymm7[5,6],ymm3[7],ymm7[8,9],ymm3[10],ymm7[11],ymm3[12],ymm7[13,14],ymm3[15] +; AVX2-FAST-ALL-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15] ; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5> ; AVX2-FAST-ALL-NEXT: 
vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] -; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm1, %ymm5 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5],ymm9[6],ymm14[7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13],ymm9[14],ymm14[15] -; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm15[2],ymm12[3],ymm15[4],ymm12[5,6],ymm15[7],ymm12[8,9],ymm15[10],ymm12[11],ymm15[12],ymm12[13,14],ymm15[15] +; AVX2-FAST-ALL-NEXT: vpermd %ymm13, %ymm1, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-ALL-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-ALL-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-ALL-NEXT: # ymm3 = ymm9[0],mem[1],ymm9[2,3],mem[4],ymm9[5],mem[6],ymm9[7,8],mem[9],ymm9[10,11],mem[12],ymm9[13],mem[14],ymm9[15] +; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm14[2],ymm12[3],ymm14[4],ymm12[5,6],ymm14[7],ymm12[8,9],ymm14[10],ymm12[11],ymm14[12],ymm12[13,14],ymm14[15] ; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm2, %ymm2 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5],ymm4[6,7] +; AVX2-FAST-ALL-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5],ymm3[6,7] ; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rsi) @@ -2551,7 +2568,7 @@ ; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-ALL-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rcx) @@ -2560,207 +2577,208 @@ ; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%r8) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r9) -; 
AVX2-FAST-ALL-NEXT: addq $200, %rsp +; AVX2-FAST-ALL-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $296, %rsp # imm = 0x128 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4],ymm1[5],ymm4[6,7],ymm1[8],ymm4[9,10],ymm1[11],ymm4[12],ymm1[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5],ymm2[6],ymm7[7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13],ymm2[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4,5],xmm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm6[1,2],ymm11[3],ymm6[4],ymm11[5],ymm6[6,7],ymm11[8],ymm6[9,10],ymm11[11],ymm6[12],ymm11[13],ymm6[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5],ymm8[6],ymm15[7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13],ymm8[14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1,2,3],xmm12[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5],ymm10[6],ymm5[7,8],ymm10[9],ymm5[10,11],ymm10[12],ymm5[13],ymm10[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4,5],ymm11[6],ymm4[7,8],ymm11[9],ymm4[10],ymm11[11],ymm4[12,13],ymm11[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm1, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm15[1],ymm8[2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7,8],ymm15[9],ymm8[10],ymm15[11],ymm8[12,13],ymm15[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, 
%ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm12, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm9, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7,8],ymm2[9],ymm14[10],ymm2[11],ymm14[12,13],ymm2[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm13, %ymm9, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5],ymm11[6],ymm15[7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13],ymm11[14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm9[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm14, %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0],xmm9[1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7],ymm6[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5,6,7],ymm7[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],xmm10[2],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7],ymm13[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1],xmm8[2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5],ymm0[6],ymm13[7,8],ymm0[9],ymm13[10,11],ymm0[12],ymm13[13],ymm0[14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7],ymm12[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm9[2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7],ymm14[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm14[2],ymm1[3],ymm14[4],ymm1[5,6],ymm14[7],ymm1[8,9],ymm14[10],ymm1[11],ymm14[12],ymm1[13,14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0],xmm12[1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10,11],ymm3[12],ymm7[13],ymm3[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0],xmm9[1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm1[1,2],ymm14[3],ymm1[4],ymm14[5],ymm1[6,7],ymm14[8],ymm1[9,10],ymm14[11],ymm1[12],ymm14[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm12[2],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm7[1,2],ymm15[3],ymm7[4],ymm15[5],ymm7[6,7],ymm15[8],ymm7[9,10],ymm15[11],ymm7[12],ymm15[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm9[2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0],xmm10[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13],ymm8[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm12[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1],ymm5[2],mem[3],ymm5[4],mem[5,6],ymm5[7],mem[8,9],ymm5[10],mem[11],ymm5[12],mem[13,14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4],ymm7[5],ymm3[6,7],ymm7[8],ymm3[9,10],ymm7[11],ymm3[12],ymm7[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1],xmm10[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm12[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm6[2],ymm8[3],ymm6[4],ymm8[5,6],ymm6[7],ymm8[8,9],ymm6[10],ymm8[11],ymm6[12],ymm8[13,14],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm14[1],ymm7[2,3],ymm14[4],ymm7[5],ymm14[6],ymm7[7,8],ymm14[9],ymm7[10,11],ymm14[12],ymm7[13],ymm14[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm6[0,1,2],ymm1[3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7],mem[8,9],ymm1[10],mem[11],ymm1[12],mem[13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] @@ -2776,12 +2794,11 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: 
vmovaps %ymm2, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-FAST-PERLANE-NEXT: addq $296, %rsp # imm = 0x128 +; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -172,52 +172,52 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: psrld $16, %xmm7 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,3,2,3] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3] ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = 
xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm2, (%rsi) +; SSE-NEXT: movq %xmm3, (%rsi) ; SSE-NEXT: movq %xmm4, (%rdx) -; SSE-NEXT: movq %xmm7, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm5, (%r9) +; SSE-NEXT: movq %xmm9, (%rcx) +; SSE-NEXT: movq %xmm2, (%r8) +; SSE-NEXT: movq %xmm6, (%r9) ; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -232,7 +232,7 @@ ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm5 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] @@ -245,21 +245,21 @@ ; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,14,15,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,3,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm8, (%rsi) +; AVX1-NEXT: vmovq %xmm4, (%rsi) ; AVX1-NEXT: vmovq %xmm3, (%rdx) ; AVX1-NEXT: vmovq %xmm5, (%rcx) ; AVX1-NEXT: vmovq %xmm6, (%r8) -; AVX1-NEXT: vmovq %xmm4, (%r9) +; AVX1-NEXT: vmovq %xmm7, (%r9) ; AVX1-NEXT: vmovq %xmm0, (%rax) ; AVX1-NEXT: retq ; @@ -370,124 +370,124 @@ ; SSE-LABEL: vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 64(%rdi), %xmm6 -; SSE-NEXT: movdqa 80(%rdi), %xmm10 -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa 16(%rdi), 
%xmm14 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa 48(%rdi), %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[3,0] -; SSE-NEXT: movaps %xmm6, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[2,3] -; SSE-NEXT: pslld $16, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm5[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,1,0,3] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,3] +; SSE-NEXT: pslld $16, %xmm7 ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm14[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm14[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: 
pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm13[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm13 +; SSE-NEXT: por %xmm10, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm10, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm12[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm14 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm12[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,4,6] +; 
SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm4[1] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm7[0],xmm6[1,2,3] +; SSE-NEXT: andps %xmm10, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: por %xmm6, %xmm11 ; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm7, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: psrlq $48, %xmm14 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm13[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm7, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movaps %xmm12, (%rsi) +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm4[1] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3] +; SSE-NEXT: andps %xmm10, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm4, (%r8) -; SSE-NEXT: movdqa %xmm5, (%r9) -; SSE-NEXT: movdqa %xmm7, (%rax) +; SSE-NEXT: movdqa %xmm14, (%rcx) +; SSE-NEXT: movdqa %xmm8, (%r8) +; SSE-NEXT: movdqa %xmm11, (%r9) +; SSE-NEXT: movdqa %xmm10, (%rax) ; SSE-NEXT: retq ; ; AVX1-LABEL: vf8: ; AVX1: # %bb.0: ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: vmovdqa 
(%rdi), %xmm8 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm1 @@ -495,67 +495,67 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,6,6,7] ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2],xmm3[3,4,5],xmm5[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3,4,5],xmm5[6,7] ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX1-NEXT: vpslld $16, %xmm5, %xmm10 +; AVX1-NEXT: vpslld $16, %xmm5, %xmm9 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm10 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX1-NEXT: vpsrld $16, %xmm2, %xmm7 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5],xmm0[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm8[2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1,2],xmm0[3,4],xmm7[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3],xmm5[4,5],xmm6[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] -; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm0[5,6,7] -; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm12 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm12[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm9 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm9 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4],xmm10[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm5[4,5],xmm6[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm11[5,6,7] +; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[2,2,3,3] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm12[0],xmm11[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm11[3,4],xmm9[5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,4,6] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,5,4,6] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7] ; AVX1-NEXT: vpsrlq $48, %xmm2, %xmm2 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,7] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm4[1],xmm1[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-NEXT: vmovdqa %xmm9, (%rsi) -; AVX1-NEXT: vmovdqa %xmm10, (%rdx) -; AVX1-NEXT: vmovdqa %xmm11, (%rcx) -; AVX1-NEXT: vmovdqa %xmm0, (%r8) -; AVX1-NEXT: vmovdqa %xmm3, 
(%r9) -; AVX1-NEXT: vmovdqa %xmm1, (%rax) +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX1-NEXT: vmovdqa %xmm3, (%rsi) +; AVX1-NEXT: vmovdqa %xmm7, (%rdx) +; AVX1-NEXT: vmovdqa %xmm8, (%rcx) +; AVX1-NEXT: vmovdqa %xmm9, (%r8) +; AVX1-NEXT: vmovdqa %xmm6, (%r9) +; AVX1-NEXT: vmovdqa %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: vf8: @@ -565,58 +565,58 @@ ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpslld $16, %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3],xmm1[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpbroadcastw 74(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastw 74(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm0[2],xmm8[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] 
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3],xmm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %xmm9, (%rsi) -; AVX2-SLOW-NEXT: vmovdqa %xmm10, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsi) +; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rdx) +; AVX2-SLOW-NEXT: vmovdqa %xmm8, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%r8) +; AVX2-SLOW-NEXT: 
vmovdqa %xmm1, (%r9) ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -628,151 +628,151 @@ ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpslld $16, %xmm0, %xmm3 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3],xmm3[4,5],xmm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw 74(%rdi), %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2],xmm3[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm0[2],xmm10[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb 
{{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %xmm8, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %xmm9, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %xmm3, (%r8) -; AVX2-FAST-NEXT: vmovdqa %xmm5, (%r9) +; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rsi) +; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rdx) +; AVX2-FAST-NEXT: vmovdqa %xmm8, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %xmm6, (%r8) +; AVX2-FAST-NEXT: vmovdqa %xmm1, (%r9) ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: vf8: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <3,9,15,u,u,u,u,u> ; AVX512-NEXT: vmovdqu64 (%rdi), %zmm1 -; AVX512-NEXT: vpermw %zmm1, %zmm0, %zmm8 +; AVX512-NEXT: vpermw %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <2,8,14,u,u,u,u,u> -; AVX512-NEXT: vpermw %zmm1, %zmm0, %zmm9 +; AVX512-NEXT: vpermw %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <1,7,13,u,u,u,u,u> ; AVX512-NEXT: vpermw %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <0,6,12,u,u,u,u,u> ; AVX512-NEXT: vpermw %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512-NEXT: vpextrw $2, %xmm1, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX512-NEXT: vpextrw $6, %xmm0, %eax -; AVX512-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $2, %xmm1, %r10d +; AVX512-NEXT: vpinsrw $3, %r10d, %xmm2, %xmm2 +; AVX512-NEXT: 
vmovd %xmm0, %r10d +; AVX512-NEXT: vpinsrw $4, %r10d, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $6, %xmm0, %r10d +; AVX512-NEXT: vpinsrw $5, %r10d, %xmm2, %xmm2 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-NEXT: vpextrw $4, %xmm3, %eax -; AVX512-NEXT: vpinsrw $6, %eax, %xmm2, %xmm5 +; AVX512-NEXT: vpextrw $4, %xmm3, %r10d +; AVX512-NEXT: vpinsrw $6, %r10d, %xmm2, %xmm9 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512-NEXT: vpextrw $2, %xmm2, %eax -; AVX512-NEXT: vpinsrw $7, %eax, %xmm5, %xmm10 +; AVX512-NEXT: vpextrw $2, %xmm2, %edi +; AVX512-NEXT: vpinsrw $7, %edi, %xmm9, %xmm9 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm1[3],xmm6[4,5,6,7] -; AVX512-NEXT: vpextrw $1, %xmm0, %eax -; AVX512-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrw $7, %xmm0, %eax -; AVX512-NEXT: vpinsrw $5, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrw $5, %xmm3, %eax -; AVX512-NEXT: vpinsrw $6, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrw $3, %xmm2, %eax -; AVX512-NEXT: vpinsrw $7, %eax, %xmm6, %xmm11 -; AVX512-NEXT: vpextrw $4, %xmm1, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm9, %xmm5 -; AVX512-NEXT: vpextrw $2, %xmm0, %eax -; AVX512-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 -; AVX512-NEXT: vmovd %xmm3, %eax -; AVX512-NEXT: vpinsrw $5, %eax, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $1, %xmm0, %edi +; AVX512-NEXT: vpinsrw $4, %edi, %xmm6, %xmm6 +; AVX512-NEXT: vpextrw $7, %xmm0, %edi +; AVX512-NEXT: vpinsrw $5, %edi, %xmm6, %xmm6 +; AVX512-NEXT: vpextrw $5, %xmm3, %edi +; AVX512-NEXT: vpinsrw $6, %edi, %xmm6, %xmm6 +; AVX512-NEXT: vpextrw $3, %xmm2, %edi +; AVX512-NEXT: vpinsrw $7, %edi, %xmm6, %xmm6 +; AVX512-NEXT: vpextrw $4, %xmm1, %edi +; AVX512-NEXT: vpinsrw $3, %edi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $2, %xmm0, %edi +; AVX512-NEXT: vpinsrw $4, %edi, %xmm5, %xmm5 +; AVX512-NEXT: vmovd %xmm3, %edi +; AVX512-NEXT: vpinsrw $5, %edi, %xmm5, %xmm5 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] -; AVX512-NEXT: vpextrw $4, %xmm2, %eax -; AVX512-NEXT: vpinsrw $7, %eax, %xmm5, %xmm9 -; AVX512-NEXT: vpextrw $5, %xmm1, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm8, %xmm6 -; AVX512-NEXT: vpextrw $3, %xmm0, %eax -; AVX512-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrw $1, %xmm3, %eax -; AVX512-NEXT: vpinsrw $5, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrw $7, %xmm3, %eax -; AVX512-NEXT: vpinsrw $6, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrw $5, %xmm2, %eax -; AVX512-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrw $2, %xmm4, %eax -; AVX512-NEXT: vpextrw $4, %xmm7, %edi -; AVX512-NEXT: vmovd %edi, %xmm5 -; AVX512-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $6, %xmm1, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm5, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm0[4],xmm5[5,6,7] -; AVX512-NEXT: vpextrw $2, %xmm3, %eax -; AVX512-NEXT: vpinsrw $5, %eax, %xmm5, %xmm5 -; AVX512-NEXT: vmovd %xmm2, %eax -; AVX512-NEXT: vpinsrw $6, %eax, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $6, %xmm2, %eax -; AVX512-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 -; AVX512-NEXT: vpextrw $3, %xmm4, %eax -; AVX512-NEXT: vpextrw $5, %xmm7, %edi -; AVX512-NEXT: vmovd %edi, %xmm4 -; AVX512-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $1, %xmm1, %eax -; AVX512-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm4, %xmm1 -; AVX512-NEXT: vpextrw $5, %xmm0, %eax -; AVX512-NEXT: vpinsrw $4, %eax, %xmm1, %xmm0 -; 
AVX512-NEXT: vpextrw $3, %xmm3, %eax -; AVX512-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpextrw $1, %xmm2, %eax -; AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpextrw $4, %xmm2, %edi +; AVX512-NEXT: vpinsrw $7, %edi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $5, %xmm1, %edi +; AVX512-NEXT: vpinsrw $3, %edi, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $3, %xmm0, %edi +; AVX512-NEXT: vpinsrw $4, %edi, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $1, %xmm3, %edi +; AVX512-NEXT: vpinsrw $5, %edi, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $7, %xmm3, %edi +; AVX512-NEXT: vpinsrw $6, %edi, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $5, %xmm2, %edi +; AVX512-NEXT: vpinsrw $7, %edi, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $2, %xmm8, %edi +; AVX512-NEXT: vpextrw $4, %xmm7, %r10d +; AVX512-NEXT: vmovd %r10d, %xmm10 +; AVX512-NEXT: vpinsrw $1, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vmovd %xmm1, %edi +; AVX512-NEXT: vpinsrw $2, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrw $6, %xmm1, %edi +; AVX512-NEXT: vpinsrw $3, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6,7] +; AVX512-NEXT: vpextrw $2, %xmm3, %edi +; AVX512-NEXT: vpinsrw $5, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vmovd %xmm2, %edi +; AVX512-NEXT: vpinsrw $6, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrw $6, %xmm2, %edi +; AVX512-NEXT: vpinsrw $7, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrw $3, %xmm8, %edi +; AVX512-NEXT: vpextrw $5, %xmm7, %r10d +; AVX512-NEXT: vmovd %r10d, %xmm7 +; AVX512-NEXT: vpinsrw $1, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrw $1, %xmm1, %edi +; AVX512-NEXT: vpinsrw $2, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrw $7, %xmm1, %edi +; AVX512-NEXT: vpinsrw $3, %edi, %xmm7, %xmm1 +; AVX512-NEXT: vpextrw $5, %xmm0, %edi +; AVX512-NEXT: vpinsrw $4, %edi, %xmm1, %xmm0 +; AVX512-NEXT: vpextrw $3, %xmm3, %edi +; AVX512-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpextrw $1, %xmm2, %edi +; AVX512-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512-NEXT: vmovdqa %xmm10, (%rsi) -; AVX512-NEXT: vmovdqa %xmm11, (%rdx) -; AVX512-NEXT: vmovdqa %xmm9, (%rcx) -; AVX512-NEXT: vmovdqa %xmm6, (%r8) -; AVX512-NEXT: vmovdqa %xmm5, (%r9) -; AVX512-NEXT: vmovdqa %xmm0, (%r10) +; AVX512-NEXT: vmovdqa %xmm9, (%rsi) +; AVX512-NEXT: vmovdqa %xmm6, (%rdx) +; AVX512-NEXT: vmovdqa %xmm5, (%rcx) +; AVX512-NEXT: vmovdqa %xmm4, (%r8) +; AVX512-NEXT: vmovdqa %xmm10, (%r9) +; AVX512-NEXT: vmovdqa %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <48 x i16>, ptr %in.vec, align 32 @@ -797,630 +797,614 @@ define void @vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $72, %rsp -; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movdqa 128(%rdi), %xmm11 +; SSE-NEXT: subq $104, %rsp +; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa 128(%rdi), %xmm12 ; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm12 +; SSE-NEXT: movdqa 80(%rdi), %xmm10 ; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rdi), 
%xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm12[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm12 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSE-NEXT: pslld $16, %xmm10 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa 160(%rdi), %xmm3 -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3] -; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa 144(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm9[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,0] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: movdqa 160(%rdi), %xmm4 +; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,3] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movdqa 144(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm9[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: psrld $16, %xmm15 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; SSE-NEXT: movdqa %xmm13, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm6[2,0] -; SSE-NEXT: psrld $16, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm11, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = 
xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm14, %xmm15 +; SSE-NEXT: pand %xmm13, %xmm9 +; SSE-NEXT: por %xmm15, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm15[2,0] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm13 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm10, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm2[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte 
Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm15, (%rsp) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm3[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0,1],mem[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm1, %xmm11 ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3] -; SSE-NEXT: punpcklqdq 
{{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm12[1] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm1[0],xmm6[1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm13[1] +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm1[0],xmm15[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: andps %xmm7, %xmm6 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm12, 
%xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm15 +; SSE-NEXT: por %xmm15, %xmm2 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: pshufd $196, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: andps %xmm7, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: psrlq $48, %xmm15 -; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; SSE-NEXT: psrld $16, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm10[0],xmm1[1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm15[0],xmm0[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE-NEXT: psrld $16, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm6[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: andps %xmm7, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: psrld $16, %xmm13 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm10[0],xmm1[1,2,3] -; SSE-NEXT: andps %xmm7, %xmm1 -; SSE-NEXT: 
pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movaps %xmm8, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: pandn %xmm6, %xmm15 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: psrld $16, %xmm8 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movaps %xmm10, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps %xmm9, (%rdx) -; SSE-NEXT: movdqa %xmm5, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm4, 16(%r8) -; SSE-NEXT: movdqa %xmm11, (%r8) -; SSE-NEXT: movdqa %xmm0, 16(%r9) +; SSE-NEXT: movdqa %xmm11, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movdqa %xmm3, 16(%r8) +; SSE-NEXT: movdqa %xmm7, (%r8) +; SSE-NEXT: movdqa %xmm1, 16(%r9) ; SSE-NEXT: movdqa %xmm2, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm7, 16(%rax) -; SSE-NEXT: movdqa %xmm14, (%rax) -; SSE-NEXT: addq $72, %rsp +; SSE-NEXT: movdqa %xmm12, 16(%rax) +; SSE-NEXT: movdqa %xmm15, (%rax) +; SSE-NEXT: addq $104, %rsp ; SSE-NEXT: retq ; ; AVX1-LABEL: vf16: ; AVX1: # %bb.0: -; AVX1-NEXT: subq $184, %rsp +; AVX1-NEXT: subq $104, %rsp ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,3] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX1-NEXT: vpslld $16, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa %xmm2, %xmm10 -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovdqa (%rdi), %xmm11 +; 
AVX1-NEXT: vmovdqa %xmm1, %xmm7 +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpslld $16, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm5 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpsrlq $16, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa %xmm3, %xmm7 +; AVX1-NEXT: vpsrlq $16, %xmm8, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,0,2,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,1,0,3] -; AVX1-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5],ymm2[6,7] -; AVX1-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpslld $16, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX1-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX1-NEXT: vpsrlq $16, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa %xmm1, %xmm9 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[0,1,0,2,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4,5],xmm11[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2],ymm5[3,4,5],ymm10[6,7] +; AVX1-NEXT: vmovdqa 176(%rdi), %xmm10 +; AVX1-NEXT: vpslld $16, %xmm10, %xmm5 +; AVX1-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm12 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; AVX1-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX1-NEXT: vpsrlq $16, %xmm12, %xmm0 ; AVX1-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,1,0,2,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7] -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; 
AVX1-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-NEXT: vandps %ymm3, %ymm14, %ymm3 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vmovaps %ymm2, %ymm14 -; AVX1-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpsrld $16, %xmm5, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] -; AVX1-NEXT: vmovdqa %xmm10, %xmm13 -; AVX1-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm7, %xmm2 ; AVX1-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX1-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpsrld $16, %xmm6, %xmm7 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5],ymm3[6,7] +; AVX1-NEXT: vpsrld $16, %xmm7, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,1,1,1] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpsrld $16, %xmm6, %xmm9 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,2,3,3] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: 
vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-NEXT: vandps %ymm0, %ymm14, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-NEXT: vandnps %ymm1, %ymm14, %ymm1 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm11[2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3,4],xmm4[5,6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm5[0,1],xmm14[2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm13[4,5],xmm8[6,7] -; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm15 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm15, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm15 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm5 # 16-byte Folded Reload +; AVX1-NEXT: # xmm5 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] +; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1,2,3],xmm13[4,5],xmm4[6,7] +; AVX1-NEXT: vpshufb %xmm9, %xmm15, %xmm13 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 +; AVX1-NEXT: vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-NEXT: vandps %ymm1, %ymm13, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm13 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm13[0],xmm1[0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1,2,3],xmm12[4,5],xmm7[6,7] -; AVX1-NEXT: vpshufb %xmm5, %xmm13, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4],xmm5[5,6,7] -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-NEXT: vandps %ymm0, %ymm8, %ymm0 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm7 = 
xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm7[0],xmm1[0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] +; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm9 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4],xmm9[5,6,7] +; AVX1-NEXT: vandps %ymm0, %ymm14, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vandnps %ymm1, %ymm8, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm14, %ymm1 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa %xmm3, %xmm6 -; AVX1-NEXT: vpsrlq $48, %xmm3, %xmm1 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX1-NEXT: vpshufb %xmm5, %xmm10, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] -; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-NEXT: vandnps %ymm1, %ymm15, %ymm1 -; AVX1-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-NEXT: vpsrlq $48, %xmm11, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[2,2,3,3] -; AVX1-NEXT: vmovdqa %xmm9, %xmm10 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm3[0] -; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-NEXT: vandps %ymm1, %ymm8, %ymm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-NEXT: vandnps %ymm3, %ymm8, %ymm3 -; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-NEXT: # xmm3 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,1,1,1] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovdqa %xmm2, %xmm0 +; AVX1-NEXT: vpsrlq $48, %xmm2, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,2,3,3] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm9[0],xmm1[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] +; AVX1-NEXT: vpshufb %xmm3, %xmm15, %xmm9 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm9, %ymm2 +; AVX1-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vpsrlq $48, %xmm4, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] 
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm9[0],xmm2[0] +; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX1-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm14 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,4,6] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm6[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3,4,5,6,7] -; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-NEXT: vorps %ymm1, %ymm5, %ymm1 -; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: # xmm5 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,4,6] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX1-NEXT: vpsrlq $48, %xmm9, %xmm1 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vpsrlq $48, %xmm8, %xmm3 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-NEXT: vpsrld $16, %xmm6, %xmm3 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,5,5,7] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm6[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7] -; AVX1-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm11, %xmm3 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,5,5,7] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, (%r8) -; AVX1-NEXT: vmovaps %ymm12, (%r9) +; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,4,5,4,6] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3,4,5,6,7] +; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vandnps %ymm1, %ymm9, %ymm1 +; AVX1-NEXT: vandps %ymm7, %ymm9, %ymm7 +; AVX1-NEXT: vorps %ymm1, %ymm7, %ymm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,5,4,6] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3,4],xmm3[5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX1-NEXT: vpsrlq $48, %xmm5, %xmm3 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm11 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] +; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vpsrlq $48, %xmm4, %xmm3 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm4 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,5,5,7] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] +; AVX1-NEXT: vandnps %ymm2, %ymm9, %ymm2 +; AVX1-NEXT: vandps %ymm3, %ymm9, %ymm3 +; AVX1-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX1-NEXT: vpsrld $16, %xmm12, %xmm4 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm10[0,1,2,3,4,5,5,7] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 
(%rdx) +; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-NEXT: vmovaps %ymm14, (%r8) +; AVX1-NEXT: vmovaps %ymm1, (%r9) ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: vmovaps %ymm0, (%rax) -; AVX1-NEXT: addq $184, %rsp +; AVX1-NEXT: vmovaps %ymm2, (%rax) +; AVX1-NEXT: addq $104, %rsp ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm15[2],ymm1[3,4],ymm15[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm2[2,3],ymm5[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[0,2,2,1,4,6,6,5] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,1,4,6,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm7[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm2[0,1],ymm5[0,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4,5,6],ymm11[7],ymm2[8,9],ymm11[10],ymm2[11,12,13,14],ymm11[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3],xmm4[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,1,2,1,6,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4,5,6],ymm1[7],ymm4[8,9],ymm1[10],ymm4[11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,0,3,5,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,6,7,8,9,10,11,12,12,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3,4],ymm4[5,6],ymm8[7],ymm4[8],ymm8[9,10,11,12],ymm4[13,14],ymm8[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,1,2,3] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[2,1,2,1,6,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4,5,6],ymm9[7],ymm5[8,9],ymm9[10],ymm5[11,12,13,14],ymm9[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[1,1,0,3,5,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,6,7,8,9,10,11,12,12,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3,4],ymm10[5,6],ymm13[7],ymm10[8],ymm13[9,10,11,12],ymm10[13,14],ymm13[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[2,1,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm15[1,2],xmm2[3],xmm15[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7],ymm10[8,9,10],ymm8[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm10[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6],xmm2[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = 
ymm12[0,1,3,3,4,5,6,7,8,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3,4],ymm9[5,6],ymm10[7],ymm9[8],ymm10[9,10,11,12],ymm9[13,14],ymm10[15] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7],ymm9[8,9,10],ymm2[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2,3],xmm4[4],xmm10[5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,2,1,6,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3,4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10,11,12],ymm10[13],ymm11[14,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3,4],ymm1[5],ymm6[6,7],ymm1[8],ymm6[9,10,11,12],ymm1[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,1,0,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm6[0,1,2,3,4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,3,3,4,5,6,7,8,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3,4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9,10,11,12],ymm1[13,14],ymm3[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,1,2,1,6,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3,4],ymm4[5],ymm6[6,7],ymm4[8],ymm6[9,10,11,12],ymm4[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3,4],ymm3[5],ymm5[6,7],ymm3[8],ymm5[9,10,11,12],ymm3[13],ymm5[14,15] -; AVX2-SLOW-NEXT: 
vpshufb {{.*#+}} xmm0 = xmm0[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rdx) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) @@ -1431,125 +1415,121 @@ ; ; AVX2-FAST-LABEL: vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm2[2,3],ymm5[2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[2,1,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[2,1,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,12,13,u,u,u,u,16,17,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,3,2,3,4,7,6,7] ; AVX2-FAST-NEXT: 
vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,u,u,16,17,20,21,u,u,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4,5,6],ymm12[7],ymm2[8,9],ymm12[10],ymm2[11,12,13,14],ymm12[15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3],xmm4[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,u,u,18,19,22,23,u,u,22,23,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4,5,6],ymm1[7],ymm4[8,9],ymm1[10],ymm4[11,12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[4,5,u,u,u,u,u,u,u,u,0,1,12,13,u,u,20,21,u,u,u,u,u,u,u,u,16,17,28,29,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,3,2,3,4,7,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm9[u,u,0,1,4,5,6,7,8,9,u,u,u,u,8,9,u,u,16,17,20,21,22,23,24,25,u,u,u,u,24,25] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3,4],ymm4[5,6],ymm8[7],ymm4[8],ymm8[9,10,11,12],ymm4[13,14],ymm8[15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm2 -; 
AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0,1,2],ymm6[3,4,5,6,7],ymm4[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,2,3,6,7,6,7,10,11,u,u,u,u,10,11,u,u,18,19,22,23,22,23,26,27,u,u,u,u,26,27] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3,4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9,10,11,12],ymm1[13,14],ymm3[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[2,1,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,4,5,4,5,4,5,8,9,u,u,8,9,12,13,u,u,20,21,20,21,20,21,24,25,u,u,24,25,28,29] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3,4],ymm1[5],ymm3[6,7],ymm1[8],ymm3[9,10,11,12],ymm1[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3],xmm15[4,5],xmm13[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm11[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,u,u,18,19,22,23,u,u,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4,5,6],ymm9[7],ymm5[8,9],ymm9[10],ymm5[11,12,13,14],ymm9[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[4,5,u,u,u,u,u,u,u,u,0,1,12,13,u,u,20,21,u,u,u,u,u,u,u,u,16,17,28,29,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[0,3,2,3,4,7,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm12[u,u,0,1,4,5,6,7,8,9,u,u,u,u,8,9,u,u,16,17,20,21,22,23,24,25,u,u,u,u,24,25] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3,4],ymm10[5,6],ymm13[7],ymm10[8],ymm13[9,10,11,12],ymm10[13,14],ymm13[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[2,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm2[1,2],xmm15[3],xmm2[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7],ymm10[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,2,3,6,7,6,7,10,11,u,u,u,u,10,11,u,u,18,19,22,23,22,23,26,27,u,u,u,u,26,27] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3,4],ymm9[5,6],ymm10[7],ymm9[8],ymm10[9,10,11,12],ymm9[13,14],ymm10[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7],ymm9[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} 
xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,1,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm2[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,4,5,4,5,4,5,8,9,u,u,8,9,12,13,u,u,20,21,20,21,20,21,24,25,u,u,24,25,28,29] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1,2,3,4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10,11,12],ymm7[13],ymm10[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6],ymm11[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,u,u,u,u,u,u,u,u,14,15,u,u,u,u,18,19,u,u,u,u,u,u,u,u,30,31,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,6,7,6,7,6,7,8,9,u,u,10,11,14,15,u,u,22,23,22,23,22,23,24,25,u,u,26,27,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10,11,12],ymm0[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[2,3,u,u,u,u,u,u,u,u,14,15,u,u,u,u,18,19,u,u,u,u,u,u,u,u,30,31,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,6,7,6,7,6,7,8,9,u,u,10,11,14,15,u,u,22,23,22,23,22,23,24,25,u,u,26,27,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10,11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5],xmm2[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm4[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5],xmm2[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-NEXT: vzeroupper @@ -1633,231 +1613,233 @@ ; SSE-LABEL: vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 -; SSE-NEXT: movdqa 208(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rdi), %xmm10 ; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm14 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklwd 
{{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm7 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,3,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE-NEXT: pslld $16, %xmm10 +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa 256(%rdi), %xmm4 ; SSE-NEXT: movdqa 272(%rdi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm12 ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 160(%rdi), %xmm5 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa 160(%rdi), %xmm4 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa 304(%rdi), %xmm15 -; SSE-NEXT: movdqa 288(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa 352(%rdi), %xmm12 +; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa 368(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[3,0] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm11[2,3] ; SSE-NEXT: pslld $16, %xmm11 ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm11[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pandn %xmm14, %xmm9 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm9[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufhw $237, (%rsp), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm9, %xmm0 ; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm12 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[2,0] +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,0] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pandn %xmm7, %xmm10 ; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} 
xmm4 = xmm0[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] @@ -1865,156 +1847,154 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm15[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm15[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} 
xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm7, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm8[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 
16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: psrlq $48, %xmm14 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm14[0] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] @@ -2024,32 +2004,31 @@ ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm10[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] @@ -2062,9 +2041,9 @@ ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: andps %xmm10, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm2 ; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[1,1,1,1] @@ -2075,16 +2054,15 @@ ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1],mem[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: andps %xmm10, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] @@ -2098,84 +2076,85 @@ ; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[1],mem[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm10, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm1 ; SSE-NEXT: 
por %xmm1, %xmm3 ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: psrld $16, %xmm10 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: andps %xmm10, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm2 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: psrlq $48, %xmm15 +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm9 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: andps %xmm10, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm13[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: pandn %xmm8, %xmm15 +; SSE-NEXT: andps %xmm12, %xmm2 ; SSE-NEXT: por %xmm2, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm9 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pandn %xmm8, %xmm13 +; SSE-NEXT: andps %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm9 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: andps %xmm12, %xmm2 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -2186,7 +2165,8 @@ ; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps %xmm13, 16(%rdx) +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -2195,11 +2175,11 @@ ; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload 
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm14, 48(%r8) +; SSE-NEXT: movdqa %xmm7, 48(%r8) ; SSE-NEXT: movdqa %xmm11, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) @@ -2208,10 +2188,10 @@ ; SSE-NEXT: movdqa %xmm3, 48(%r9) ; SSE-NEXT: movdqa %xmm4, 16(%r9) ; SSE-NEXT: movdqa %xmm5, 32(%r9) -; SSE-NEXT: movdqa %xmm9, (%r9) +; SSE-NEXT: movdqa %xmm6, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm10, 48(%rax) -; SSE-NEXT: movdqa %xmm7, 16(%rax) +; SSE-NEXT: movdqa %xmm12, 48(%rax) +; SSE-NEXT: movdqa %xmm13, 16(%rax) ; SSE-NEXT: movdqa %xmm15, 32(%rax) ; SSE-NEXT: movdqa %xmm1, (%rax) ; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 @@ -2219,7 +2199,7 @@ ; ; AVX1-LABEL: vf32: ; AVX1: # %bb.0: -; AVX1-NEXT: subq $584, %rsp # imm = 0x248 +; AVX1-NEXT: subq $536, %rsp # imm = 0x218 ; AVX1-NEXT: vmovdqa 288(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] @@ -2235,154 +2215,154 @@ ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpsrlq $16, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpsrlq $16, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa 240(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-NEXT: vmovdqa 208(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX1-NEXT: vmovdqa 368(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpsrlq $16, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa 336(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm9 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5],ymm2[6,7] +; AVX1-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpslld $16, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa 352(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpsrlq $16, %xmm0, %xmm4 +; AVX1-NEXT: vmovdqa 336(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,1,0,2,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5],xmm2[6,7] +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX1-NEXT: vpslld $16, %xmm12, %xmm0 -; AVX1-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm11 +; AVX1-NEXT: vpslld $16, %xmm11, %xmm1 ; AVX1-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-NEXT: vmovdqa 112(%rdi), %xmm14 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm15 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm13 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm5 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpsrlq $16, %xmm13, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,0,2,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX1-NEXT: vpsrlq $16, %xmm8, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,1,0,2,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4,5],xmm7[6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm15[3,4,5],ymm5[6,7] -; AVX1-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpslld $16, %xmm0, %xmm5 -; AVX1-NEXT: vmovdqa 160(%rdi), %xmm10 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm7 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX1-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX1-NEXT: vpsrlq $16, %xmm6, %xmm7 -; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3,4,5],xmm9[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5],ymm6[6,7] +; AVX1-NEXT: vmovdqa 176(%rdi), %xmm14 +; AVX1-NEXT: vpslld $16, %xmm14, %xmm6 +; AVX1-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm9 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX1-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX1-NEXT: vpsrlq $16, %xmm3, %xmm9 ; AVX1-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,1,0,2,4,5,6,7] -; AVX1-NEXT: vpunpckldq 
{{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5],xmm5[6,7] -; AVX1-NEXT: vandps %ymm4, %ymm9, %ymm4 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-NEXT: vandnps %ymm5, %ymm9, %ymm5 -; AVX1-NEXT: vmovaps %ymm9, %ymm7 -; AVX1-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX1-NEXT: vpsrld $16, %xmm14, %xmm4 -; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[2,2,3,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm13, %xmm11 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5],xmm6[6,7] +; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-NEXT: vandps %ymm5, %ymm9, %ymm5 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-NEXT: vandnps %ymm6, %ymm9, %ymm6 +; AVX1-NEXT: vorps %ymm6, %ymm5, %ymm5 +; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] ; AVX1-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; AVX1-NEXT: vpsrld $16, %xmm8, %xmm4 -; AVX1-NEXT: vmovdqa %xmm8, %xmm9 +; AVX1-NEXT: vpsrld $16, %xmm13, %xmm5 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,2,3,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,1,1,1] +; AVX1-NEXT: vmovdqa %xmm8, %xmm10 ; AVX1-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5],ymm1[6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; AVX1-NEXT: vmovdqa %xmm12, %xmm9 +; AVX1-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpsrld $16, %xmm12, %xmm5 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4,5],xmm4[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5],ymm2[6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; AVX1-NEXT: vmovdqa %xmm3, %xmm15 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,2,3,3] -; AVX1-NEXT: vmovdqa %xmm10, %xmm13 -; AVX1-NEXT: 
vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] +; AVX1-NEXT: vmovdqa %xmm14, %xmm8 +; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7] -; AVX1-NEXT: vandps %ymm7, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm0, %ymm7, %ymm0 -; AVX1-NEXT: vmovaps %ymm7, %ymm4 +; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovaps %ymm2, %ymm3 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpsrld $16, %xmm7, %xmm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX1-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: # xmm2 = mem[0,1,2,3,5,7,6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpsrld $16, %xmm5, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vpsrld $16, %xmm14, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2390,785 +2370,770 @@ ; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] +; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] -; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vpblendw $12, (%rsp), %xmm9, %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: # xmm2 = xmm9[0,1],mem[2,3],xmm9[4,5,6,7] ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-NEXT: vpblendw $12, (%rsp), %xmm14, %xmm2 # 16-byte Folded Reload -; AVX1-NEXT: # xmm2 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] +; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload +; AVX1-NEXT: # xmm11 = xmm13[0,1],mem[2,3],xmm13[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5],mem[6,7] -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload +; AVX1-NEXT: # xmm13 = mem[0,1,2,3],xmm3[4,5],mem[6,7] +; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-NEXT: vandps %ymm2, %ymm12, %ymm2 +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm10[4,5],xmm13[6,7] -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm4 +; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = mem[0,1,2,3],xmm8[4,5],mem[6,7] +; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7] -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-NEXT: vandnps %ymm2, %ymm3, %ymm2 -; AVX1-NEXT: vmovaps %ymm3, %ymm4 +; AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload -; AVX1-NEXT: # xmm14 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm2 # 16-byte Folded Reload -; AVX1-NEXT: # xmm2 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5],mem[6,7] -; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm13 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX1-NEXT: vandnps %ymm15, %ymm12, %ymm13 -; AVX1-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-NEXT: vorps %ymm0, %ymm13, %ymm13 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-NEXT: vpsrldq {{.*#+}} xmm7 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm6[0,1,2,3],xmm8[4,5],xmm6[6,7] -; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] -; AVX1-NEXT: vandps %ymm4, %ymm13, %ymm7 +; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: # xmm2 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; 
AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7] +; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm14 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 +; AVX1-NEXT: vandnps %ymm15, %ymm3, %ymm14 +; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm0, %ymm14, %ymm14 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vpsrldq {{.*#+}} xmm15 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; AVX1-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7] +; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4],xmm5[5,6,7] +; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-NEXT: vandps %ymm6, %ymm14, %ymm14 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-NEXT: vandnps %ymm5, %ymm4, %ymm5 -; AVX1-NEXT: vmovaps %ymm4, %ymm13 -; AVX1-NEXT: vorps %ymm5, %ymm7, %ymm0 -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-NEXT: vorps %ymm5, %ymm14, %ymm5 +; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: vpsrlq $48, %xmm15, %xmm5 +; AVX1-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-NEXT: # xmm14 = mem[2,2,3,3] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm14[0],xmm5[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm14, %xmm6, %xmm9 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3,4],xmm9[5,6,7] +; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] +; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm13 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 +; AVX1-NEXT: vandnps %ymm5, %ymm3, %ymm5 +; AVX1-NEXT: vandps %ymm3, %ymm7, %ymm7 +; AVX1-NEXT: vorps %ymm5, %ymm7, %ymm5 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm11, %xmm5 -; AVX1-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-NEXT: # xmm7 = mem[2,2,3,3] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm8 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-NEXT: vandnps %ymm5, %ymm12, %ymm5 -; AVX1-NEXT: vandps %ymm6, %ymm12, %ymm6 -; AVX1-NEXT: vorps %ymm5, %ymm6, %ymm5 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm8, %xmm6 -; AVX1-NEXT: 
vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-NEXT: # xmm4 = mem[2,2,3,3] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3,4],xmm6[5,6,7] -; AVX1-NEXT: vmovaps %ymm13, %ymm6 -; AVX1-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-NEXT: vandnps %ymm4, %ymm13, %ymm4 -; AVX1-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpsrlq $48, %xmm11, %xmm7 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,2,3,3] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm13[0],xmm7[0] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] +; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-NEXT: vandnps %ymm6, %ymm7, %ymm6 +; AVX1-NEXT: vorps %ymm6, %ymm5, %ymm5 +; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm13, %xmm4 -; AVX1-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: # xmm5 = mem[2,2,3,3] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0] -; AVX1-NEXT: vpshufb %xmm7, %xmm14, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $48, %xmm13, %xmm5 +; AVX1-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-NEXT: # xmm6 = mem[2,2,3,3] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3,4],xmm4[5,6,7] +; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm4, %ymm12, %ymm2 -; AVX1-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-NEXT: vandnps %ymm4, %ymm3, %ymm2 +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vpsrlq $48, %xmm3, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] +; AVX1-NEXT: vmovdqa %xmm10, %xmm14 +; AVX1-NEXT: vpsrlq $48, %xmm10, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vpshufb %xmm10, %xmm15, %xmm0 +; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] -; AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm7, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-NEXT: vandnps %ymm0, %ymm7, %ymm0 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = mem[2,3,2,3] -; 
AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] -; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; AVX1-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: # xmm2 = mem[2,3,2,3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-NEXT: # xmm2 = mem[0,1,0,3] -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-NEXT: # xmm3 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-NEXT: # xmm14 = mem[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,4,6] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-NEXT: # xmm12 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm5, %xmm12, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,3,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded 
Reload -; AVX1-NEXT: # xmm1 = mem[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,6] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm13[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7] -; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm3 +; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-NEXT: # xmm6 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] +; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: # xmm2 = mem[1,1,1,1] +; AVX1-NEXT: vpshufd $238, (%rsp), %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = mem[2,3,2,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-NEXT: # xmm7 = mem[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,4,6] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm15[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm4, %ymm4 -; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm6 +; AVX1-NEXT: vorps %ymm1, %ymm4, %ymm1 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-NEXT: # xmm10 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm5, %xmm10, %xmm7 -; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-NEXT: # xmm4 = mem[0,1,0,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,4,6] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm5[5,6,7] +; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,4,6] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm9 +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm5 -; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm7 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-NEXT: # xmm8 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm8, %xmm10 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-NEXT: # xmm10 = mem[1,1,1,1] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-NEXT: # xmm10 = mem[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,4,6] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm13[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3,4,5,6,7] +; AVX1-NEXT: vandnps %ymm9, %ymm2, %ymm9 +; AVX1-NEXT: vandps %ymm2, %ymm11, %ymm11 +; AVX1-NEXT: vorps %ymm9, %ymm11, %ymm11 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-NEXT: # xmm9 = mem[0,1],xmm9[2,3],mem[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX1-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-NEXT: # xmm13 = mem[0,1,0,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5,4,6] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm14[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4],xmm3[5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vpsrlq $48, %xmm11, %xmm11 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vpsrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] +; AVX1-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vpsrlq $48, %xmm12, %xmm12 +; AVX1-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX1-NEXT: vpsrldq {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vpsrld $16, %xmm14, %xmm14 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,7] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm14[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3,4,5,6,7] +; AVX1-NEXT: vandnps %ymm6, %ymm2, %ymm6 +; AVX1-NEXT: vandps %ymm2, %ymm7, %ymm7 +; AVX1-NEXT: vorps %ymm6, %ymm7, %ymm6 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; 
AVX1-NEXT: vpsrld $16, %xmm7, %xmm7 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,7] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7] +; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm5 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-NEXT: vpshufb %xmm11, %xmm8, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm6 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm7 -; AVX1-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,7] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm8[1],xmm7[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7] ; AVX1-NEXT: vandnps %ymm5, %ymm2, %ymm5 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm5, %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,5,7] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm6[1],xmm5[1] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX1-NEXT: vandps %ymm2, %ymm6, %ymm2 +; AVX1-NEXT: vorps %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm5 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm6 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-NEXT: vpsrlq $48, %xmm9, %xmm6 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm7 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpsrld $16, %xmm7, %xmm7 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3,4,5,6,7] -; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; 
AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vpshufb %xmm13, %xmm10, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm3, %xmm2 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, (%r8) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, 32(%r9) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, (%r9) +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm6 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,4,5,5,7] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%r8) +; AVX1-NEXT: vmovaps %ymm3, 32(%r9) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%r9) ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-NEXT: vmovaps %ymm5, (%rax) -; AVX1-NEXT: addq $584, %rsp # imm = 0x248 +; AVX1-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-NEXT: vmovaps %ymm4, (%rax) +; AVX1-NEXT: addq $536, %rsp # imm = 0x218 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $520, %rsp # imm = 0x208 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[0,2,2,1,4,6,6,5] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,1,4,6,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4,5,6],ymm4[7],ymm2[8,9],ymm4[10],ymm2[11,12,13,14],ymm4[15] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm0[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,1,2,1,6,5,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[2,1,2,1,6,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm13[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0],ymm11[1,2,3,4],ymm0[5],ymm11[6,7],ymm0[8],ymm11[9,10,11,12],ymm0[13],ymm11[14,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm11, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 -; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm11, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm14 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm4[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6,7] +; AVX2-SLOW-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm14[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-SLOW-NEXT: vpshufd $102, (%rsp), %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[2,1,2,1,6,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm7[1,2,3,4],ymm15[5],ymm7[6,7],ymm15[8],ymm7[9,10,11,12],ymm15[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,1,2,1,6,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm0[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7],ymm8[8],ymm7[9,10,11,12],ymm8[13],ymm7[14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm15, %ymm7, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3,4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10,11,12],ymm6[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm13[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3,4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10,11,12],ymm7[13],ymm8[14,15] 
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10,11,12],ymm0[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,0,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> ; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX2-SLOW-NEXT: vpblendd $56, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm7 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3],xmm7[4,5],xmm1[6],xmm7[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX2-SLOW-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2],xmm1[3],xmm6[4,5],xmm1[6],xmm6[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[0,2,2,1,4,6,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] -; AVX2-SLOW-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm13[3],xmm8[4,5],xmm13[6],xmm8[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[0,2,2,1,4,6,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0,1],ymm6[2],ymm11[3,4,5,6],ymm6[7],ymm11[8,9],ymm6[10],ymm11[11,12,13,14],ymm6[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1],xmm1[2],xmm10[3],xmm1[4,5],xmm10[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4,5,6],ymm13[7],ymm11[8,9],ymm13[10],ymm11[11,12,13,14],ymm13[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 
32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5,6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm10 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3],xmm14[4,5],xmm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7],ymm10[8,9,10],ymm8[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,1,2,1,6,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm1[2],ymm8[3,4,5,6],ymm1[7],ymm8[8,9],ymm1[10],ymm8[11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm10[2,1,2,1,6,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4,5,6],ymm8[7],ymm11[8,9],ymm8[10],ymm11[11,12,13,14],ymm8[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3],xmm6[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = 
xmm7[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,2,1,6,5,6,5] -; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11,12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[2,1,2,1,6,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11,12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm2[3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd $197, (%rsp), %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,0,3,5,5,4,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[1,1,0,3,5,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,7,8,9,10,11,12,12,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm10[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3,4],ymm0[5,6],ymm4[7],ymm0[8],ymm4[9,10,11,12],ymm0[13,14],ymm4[15] -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3,4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9,10,11,12],ymm0[13,14],ymm3[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,5,6,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4],xmm5[5,6],xmm7[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1,2],xmm13[3],xmm8[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3,4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = 
xmm3[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] +; AVX2-SLOW-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3],xmm0[4],xmm5[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,1,0,3,5,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,6,7,8,9,10,11,12,12,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm14[4],xmm5[5,6],xmm14[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[1,1,0,3,5,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,6,7,8,9,10,11,12,12,14,15] +; AVX2-SLOW-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm1[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1,2,3,4],ymm5[5,6],ymm10[7],ymm5[8],ymm10[9,10,11,12],ymm5[13,14],ymm10[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,1,2,3] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7],ymm5[8,9,10],ymm4[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u> -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm15[0,1,3,3,4,5,6,7,8,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3,4],ymm4[5,6],ymm9[7],ymm4[8],ymm9[9,10,11,12],ymm4[13,14],ymm9[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[1,1,1,1,4,5,6,7] -; 
AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5,6],xmm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm10[1,2,3,4],ymm14[5,6],ymm10[7],ymm14[8],ymm10[9,10,11,12],ymm14[13,14],ymm10[15] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,1,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm8[1,2],xmm15[3],xmm8[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7],ymm10[8,9,10],ymm5[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u> +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm9, %ymm10 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,3,4,5,6,7,8,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1,2,3,4],ymm10[5,6],ymm4[7],ymm10[8],ymm4[9,10,11,12],ymm10[13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5,6],xmm6[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1,2],xmm5[3],xmm7[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4],xmm4[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,5,6,5] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,6,7,8,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = 
ymm4[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3,4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9,10,11,12],ymm3[13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm12, %ymm4 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,3,4,5,6,7,8,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4],ymm4[5,6],ymm1[7],ymm4[8],ymm1[9,10,11,12],ymm4[13,14],ymm1[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2],xmm0[3],xmm4[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $146, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $146, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-SLOW-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-SLOW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $568, %rsp # imm = 0x238 +; AVX2-FAST-NEXT: subq $552, %rsp # imm = 0x228 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm3[0,1] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[2,3],ymm3[2,3] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm2[0,1],ymm3[0,1] ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm1[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm1[0,1],ymm0[0,1] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm1[0,1],ymm0[0,1] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,1,0,3,4,5,4,7] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,0,3,4,5,4,7] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[2,1,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,1,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7],ymm2[8],ymm0[9,10,11,12],ymm2[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7],ymm2[8],ymm0[9,10,11,12],ymm2[13],ymm0[14,15] ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = 
[255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,0,3,4,5,4,7] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[2,1,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3,4],ymm4[5],ymm1[6,7],ymm4[8],ymm1[9,10,11,12],ymm4[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,0,3,4,5,4,7] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm13[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1,2,3,4],ymm10[5],ymm4[6,7],ymm10[8],ymm4[9,10,11,12],ymm10[13],ymm4[14,15] +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm6 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm1 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm15 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0],xmm6[1],xmm15[2,3],xmm6[4],xmm15[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,3,u,u,u,u,u,u,u,u,14,15,u,u,u,u,18,19,u,u,u,u,u,u,u,u,30,31,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,3,u,u,u,u,u,u,u,u,14,15,u,u,u,u,18,19,u,u,u,u,u,u,u,u,30,31,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: 
vpshufb %ymm6, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2,3,4],ymm3[5],ymm7[6,7],ymm3[8],ymm7[9,10,11,12],ymm3[13],ymm7[14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10,11,12],ymm0[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4],ymm3[5],ymm1[6,7],ymm3[8],ymm1[9,10,11,12],ymm3[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10,11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,1,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,3,2,3,4,7,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0,1],ymm1[2],ymm6[3,4,5,6],ymm1[7],ymm6[8,9],ymm1[10],ymm6[11,12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5,6],ymm11[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = -; 
AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,3,2,3,4,7,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1,2],xmm12[3],xmm15[4,5],xmm12[6],xmm15[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,1,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[0,3,2,3,4,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1],ymm1[2],ymm12[3,4,5,6],ymm1[7],ymm12[8,9],ymm1[10],ymm12[11,12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm11 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3],xmm8[4,5],xmm9[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm1[3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,8,9,u,u,16,17,20,21,u,u,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4,5,6],ymm1[7],ymm6[8,9],ymm1[10],ymm6[11,12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpblendd $109, (%rsp), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5,6],ymm11[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; 
AVX2-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3],xmm13[4,5],xmm12[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm12 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[2,1,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,3,2,3,4,7,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,8,9,u,u,16,17,20,21,u,u,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1],ymm1[2],ymm12[3,4,5,6],ymm1[7],ymm12[8,9],ymm1[10],ymm12[11,12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5,6],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm15 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1],xmm10[2],xmm15[3],xmm10[4,5],xmm15[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm15 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm6 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3],xmm14[4,5],xmm2[6],xmm14[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm15 = 
ymm14[0,1],ymm4[2],ymm14[3,4,5,6],ymm4[7],ymm14[8,9],ymm4[10],ymm14[11,12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm15[3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3],xmm13[4,5],xmm6[6],xmm13[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4,5,6],ymm11[7],ymm2[8,9],ymm11[10],ymm2[11,12,13,14],ymm11[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2],xmm12[3],xmm4[4,5],xmm12[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] @@ -3177,169 +3142,167 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm0 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11,12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] +; AVX2-FAST-NEXT: 
vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11,12,13,14],ymm5[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[0,3,2,3,4,7,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <4,5,u,u,u,u,u,u,u,u,0,1,12,13,u,u,20,21,u,u,u,u,u,u,u,u,16,17,28,29,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3,4],ymm4[5,6],ymm6[7],ymm4[8],ymm6[9,10,11,12],ymm4[13,14],ymm6[15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,3,2,3,4,7,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <4,5,u,u,u,u,u,u,u,u,0,1,12,13,u,u,20,21,u,u,u,u,u,u,u,u,16,17,28,29,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3,4],ymm5[5,6],ymm6[7],ymm5[8],ymm6[9,10,11,12],ymm5[13,14],ymm6[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,1,0,3] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4],xmm6[5,6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm6 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm12[4],xmm6[5,6],xmm12[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded 
Reload +; AVX2-FAST-NEXT: # ymm12 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7],ymm5[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = mem[0,3,2,3,4,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3,4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9,10,11,12],ymm0[13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm11[0,1],mem[2],ymm11[3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm7 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6],xmm4[7] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4],xmm7[5,6],xmm11[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u> -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd $219, (%rsp), %ymm2, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm14 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm14[1,2],xmm6[3],xmm14[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3,4,5,6,7],ymm0[8,9,10],ymm7[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3,4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,u,u,u,u,u,u,u,u,2,3,14,15,u,u,22,23,u,u,u,u,u,u,u,u,18,19,30,31,u,u> +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3,4],ymm2[5,6],ymm15[7],ymm2[8],ymm15[9,10,11,12],ymm2[13,14],ymm15[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9,10,11,12],ymm1[13,14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6],ymm4[7],ymm7[8],ymm4[9,10,11,12],ymm7[13,14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5,6],xmm9[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm13 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2],xmm12[3],xmm13[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] +; 
AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4],ymm6[5,6],ymm1[7],ymm6[8],ymm1[9,10,11,12],ymm6[13,14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5,6],ymm11[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3],xmm2[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5,6],mem[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3],xmm6[4],xmm8[5],xmm6[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6],ymm8[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = 
mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm13, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte 
Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm6, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-FAST-NEXT: addq $568, %rsp # imm = 0x238 +; AVX2-FAST-NEXT: addq $552, %rsp # imm = 0x228 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -156,34 +156,34 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind { ; SSE-LABEL: load_i32_stride2_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm6 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps 48(%rdi), %xmm9 -; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdi), %xmm4 ; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm11 +; SSE-NEXT: movaps 112(%rdi), %xmm6 ; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2] -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm9[0,2] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm11[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm10[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm9[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm8[1,3] -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[0,2] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm4[0,2] +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm3[0,2] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm6[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: movaps %xmm9, 32(%rsi) +; SSE-NEXT: movaps %xmm11, (%rsi) +; SSE-NEXT: movaps %xmm8, 48(%rsi) +; SSE-NEXT: movaps %xmm10, 16(%rsi) ; SSE-NEXT: movaps %xmm5, 32(%rdx) -; SSE-NEXT: movaps %xmm6, (%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm7, 48(%rdx) -; SSE-NEXT: movaps %xmm4, 16(%rdx) +; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i32_stride2_vf16: @@ -252,8 +252,8 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind { ; SSE-LABEL: load_i32_stride2_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm9 
-; SSE-NEXT: movaps 32(%rdi), %xmm14 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps 208(%rdi), %xmm10 ; SSE-NEXT: movaps 192(%rdi), %xmm2 @@ -263,18 +263,18 @@ ; SSE-NEXT: movaps 64(%rdi), %xmm6 ; SSE-NEXT: movaps 240(%rdi), %xmm13 ; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps 176(%rdi), %xmm15 +; SSE-NEXT: movaps 176(%rdi), %xmm14 ; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm1 +; SSE-NEXT: movaps 112(%rdi), %xmm15 ; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm1[1,3] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm15[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm15[1,3] -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm13[0,2] +; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm15[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm15[1,3] +; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm14[1,3] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm13[1,3] ; SSE-NEXT: movaps %xmm2, %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm10[0,2] @@ -285,23 +285,23 @@ ; SSE-NEXT: movaps %xmm6, %xmm11 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm12[1,3] -; SSE-NEXT: movaps %xmm14, %xmm12 +; SSE-NEXT: movaps %xmm1, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,3],xmm8[1,3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm8[1,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm9, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm8[1,3] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm8[1,3] ; SSE-NEXT: movaps %xmm13, 96(%rsi) ; SSE-NEXT: movaps %xmm10, 64(%rsi) ; SSE-NEXT: movaps %xmm11, 32(%rsi) -; SSE-NEXT: movaps %xmm14, (%rsi) -; SSE-NEXT: movaps %xmm15, 112(%rsi) -; SSE-NEXT: movaps %xmm1, 80(%rsi) -; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm14, 112(%rsi) +; SSE-NEXT: movaps %xmm15, 80(%rsi) +; SSE-NEXT: movaps %xmm9, 48(%rsi) ; SSE-NEXT: movaps %xmm12, 16(%rsi) -; SSE-NEXT: movaps %xmm9, (%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm6, 32(%rdx) ; SSE-NEXT: movaps %xmm3, 64(%rdx) ; SSE-NEXT: movaps %xmm2, 96(%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -170,40 +170,40 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps 80(%rdi), %xmm8 -; SSE-NEXT: movaps 64(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdi), %xmm0 +; SSE-NEXT: movaps 
64(%rdi), %xmm4 ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movaps 16(%rdi), %xmm6 -; SSE-NEXT: movaps 32(%rdi), %xmm10 +; SSE-NEXT: movaps 32(%rdi), %xmm3 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,2] ; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm8[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm10[2,3] +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,3] ; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movaps %xmm11, (%rsi) +; SSE-NEXT: movaps %xmm5, (%rsi) ; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps %xmm5, 16(%rcx) -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm11, 16(%rcx) +; SSE-NEXT: movaps %xmm8, (%rcx) ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i32_stride3_vf8: @@ -358,94 +358,93 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps 96(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rdi), %xmm11 -; SSE-NEXT: movaps 112(%rdi), %xmm12 -; SSE-NEXT: movaps 144(%rdi), %xmm3 -; SSE-NEXT: movaps 176(%rdi), %xmm13 -; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps (%rdi), %xmm15 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm10 -; SSE-NEXT: movaps 80(%rdi), %xmm9 +; SSE-NEXT: movaps 96(%rdi), %xmm15 +; SSE-NEXT: movaps 128(%rdi), %xmm8 +; SSE-NEXT: movaps 112(%rdi), %xmm9 +; SSE-NEXT: movaps 144(%rdi), %xmm1 +; SSE-NEXT: movaps 176(%rdi), %xmm7 +; SSE-NEXT: movaps 160(%rdi), %xmm11 +; SSE-NEXT: movaps (%rdi), %xmm10 +; 
SSE-NEXT: movaps 16(%rdi), %xmm3 +; SSE-NEXT: movaps 32(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps 80(%rdi), %xmm14 ; SSE-NEXT: movaps 64(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[1,0] -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[1,0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0] -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm7 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[1,0] +; SSE-NEXT: movaps %xmm1, %xmm6 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0] ; SSE-NEXT: movaps %xmm10, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[1,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm13 +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm2[0,0] ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm9[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[0,0] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm11[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm11[0,0] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[0,0] +; SSE-NEXT: movaps %xmm9, %xmm0 +; 
SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm8[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm8[0,0] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm13[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm3[0,0] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3] +; SSE-NEXT: movaps %xmm13, 32(%rsi) +; SSE-NEXT: movaps %xmm6, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm15, (%rdx) -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps %xmm6, 16(%rdx) -; SSE-NEXT: movaps %xmm7, 32(%rcx) +; SSE-NEXT: movaps %xmm10, (%rdx) +; SSE-NEXT: movaps %xmm15, 48(%rdx) +; SSE-NEXT: movaps %xmm12, 16(%rdx) +; SSE-NEXT: movaps %xmm4, 32(%rcx) ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps %xmm5, 48(%rcx) 
+; SSE-NEXT: movaps %xmm8, 48(%rcx) ; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: retq ; @@ -691,62 +690,61 @@ ; SSE-LABEL: load_i32_stride3_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $344, %rsp # imm = 0x158 -; SSE-NEXT: movaps 336(%rdi), %xmm1 -; SSE-NEXT: movaps 368(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 336(%rdi), %xmm10 +; SSE-NEXT: movaps 368(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 352(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm15 -; SSE-NEXT: movaps 272(%rdi), %xmm13 -; SSE-NEXT: movaps 256(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm3 -; SSE-NEXT: movaps 176(%rdi), %xmm12 -; SSE-NEXT: movaps 160(%rdi), %xmm10 -; SSE-NEXT: movaps 48(%rdi), %xmm5 -; SSE-NEXT: movaps 80(%rdi), %xmm6 -; SSE-NEXT: movaps 64(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm6 +; SSE-NEXT: movaps 272(%rdi), %xmm5 +; SSE-NEXT: movaps 256(%rdi), %xmm11 +; SSE-NEXT: movaps 144(%rdi), %xmm15 +; SSE-NEXT: movaps 176(%rdi), %xmm8 +; SSE-NEXT: movaps 160(%rdi), %xmm13 +; SSE-NEXT: movaps 48(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdi), %xmm4 +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[1,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[1,0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[1,0] ; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: movaps %xmm15, %xmm4 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] +; SSE-NEXT: movaps %xmm5, %xmm12 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[1,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, 
%xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm5 ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm2 +; SSE-NEXT: movaps (%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] ; SSE-NEXT: movaps 96(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -754,151 +752,150 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 224(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps 208(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] ; SSE-NEXT: movaps 192(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm14 ; SSE-NEXT: movaps 304(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[1,0] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 288(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm7[0,0] +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] 
+; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm8[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm13 +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm10[0,0] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm14[0,0] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm8[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0] -; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm7[0,0] -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm4[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; 
SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm5[0,0] +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm4[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm8[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[0,1],mem[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = 
xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[0,1],mem[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[0,1],mem[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movaps %xmm1, 96(%rdx) -; SSE-NEXT: movaps %xmm3, 64(%rdx) -; SSE-NEXT: movaps %xmm10, 32(%rdx) -; SSE-NEXT: movaps %xmm11, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 16(%rsi) +; SSE-NEXT: movaps %xmm3, 96(%rdx) +; SSE-NEXT: movaps %xmm6, 64(%rdx) +; SSE-NEXT: movaps %xmm11, 32(%rdx) +; SSE-NEXT: movaps %xmm9, (%rdx) ; SSE-NEXT: movaps %xmm12, 112(%rdx) -; SSE-NEXT: movaps %xmm15, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: 
movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm13, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rdx) ; SSE-NEXT: movaps %xmm0, 96(%rcx) -; SSE-NEXT: movaps %xmm7, 112(%rcx) -; SSE-NEXT: movaps %xmm9, 64(%rcx) -; SSE-NEXT: movaps %xmm4, 80(%rcx) -; SSE-NEXT: movaps %xmm14, 32(%rcx) -; SSE-NEXT: movaps %xmm13, 48(%rcx) +; SSE-NEXT: movaps %xmm1, 112(%rcx) +; SSE-NEXT: movaps %xmm10, 64(%rcx) +; SSE-NEXT: movaps %xmm14, 80(%rcx) +; SSE-NEXT: movaps %xmm5, 32(%rcx) +; SSE-NEXT: movaps %xmm4, 48(%rcx) ; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movaps %xmm5, 16(%rcx) +; SSE-NEXT: movaps %xmm7, 16(%rcx) ; SSE-NEXT: addq $344, %rsp # imm = 0x158 ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -196,97 +196,97 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i32_stride4_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm5 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm6 -; SSE-NEXT: movaps 48(%rdi), %xmm9 -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm4 -; SSE-NEXT: movaps 112(%rdi), %xmm11 -; SSE-NEXT: movaps 96(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm7[0] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm3 +; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps 80(%rdi), %xmm5 +; SSE-NEXT: movaps 64(%rdi), %xmm1 +; SSE-NEXT: movaps 112(%rdi), %xmm6 +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = 
xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movaps %xmm12, 16(%rsi) +; SSE-NEXT: movaps %xmm10, 16(%rsi) ; SSE-NEXT: movaps %xmm13, (%rsi) -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps %xmm7, (%rcx) -; SSE-NEXT: movaps %xmm4, 16(%r8) -; SSE-NEXT: movaps %xmm5, (%r8) +; SSE-NEXT: movaps %xmm9, 16(%rdx) +; SSE-NEXT: movaps %xmm12, (%rdx) +; SSE-NEXT: movaps %xmm5, 16(%rcx) +; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i32_stride4_vf8: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX1-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] ; AVX1-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX1-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm6[0],xmm5[0] -; AVX1-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,0] -; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[4],ymm1[4],ymm10[5],ymm1[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,0],ymm4[1,0],ymm0[5,4],ymm4[5,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm9[2,0],ymm8[2,3],ymm9[6,4],ymm8[6,7] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[1],xmm7[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] -; AVX1-NEXT: vunpckhps {{.*#+}} ymm9 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7] -; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] -; AVX1-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm5[2],xmm6[2] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; 
AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] +; AVX1-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm9 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,0] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm4[1,0],ymm0[5,4],ymm4[5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,0],ymm7[2,3],ymm10[6,4],ymm7[6,7] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-NEXT: vinsertps {{.*#+}} xmm11 = xmm8[1],xmm9[1],zero,zero +; AVX1-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm11 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7] +; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] +; AVX1-NEXT: vinsertps {{.*#+}} xmm11 = zero,zero,xmm5[2],xmm6[2] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[3,0],ymm0[7,4],ymm4[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7] ; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm3[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,0],xmm1[2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm9[3,0],xmm8[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vmovaps %ymm11, (%rsi) -; AVX1-NEXT: vmovaps %ymm12, (%rdx) -; AVX1-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-NEXT: vmovaps %ymm7, (%rdx) +; AVX1-NEXT: vmovaps %ymm10, (%rcx) ; AVX1-NEXT: vmovaps %ymm0, (%r8) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_i32_stride4_vf8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [17179869184,17179869184,17179869184,17179869184] @@ -294,45 +294,45 @@ ; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-NEXT: vmovaps {{.*#+}} xmm5 = -; AVX2-NEXT: vpermps %ymm9, %ymm5, %ymm6 +; AVX2-NEXT: vpermps %ymm4, %ymm5, %ymm6 ; AVX2-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX2-NEXT: vmovaps 16(%rdi), %xmm8 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [21474836481,21474836481,21474836481,21474836481] -; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm6 -; AVX2-NEXT: vpermps %ymm1, 
%ymm4, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836481,21474836481,21474836481,21474836481] +; AVX2-NEXT: vpermps %ymm2, %ymm6, %ymm9 +; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vmovaps 48(%rdi), %xmm9 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] ; AVX2-NEXT: vmovaps {{.*#+}} xmm11 = <1,5,u,u> -; AVX2-NEXT: vpermps %ymm8, %ymm11, %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [25769803778,25769803778,25769803778,25769803778] -; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm11 -; AVX2-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vpermps %ymm0, %ymm11, %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm10 = [25769803778,25769803778,25769803778,25769803778] +; AVX2-NEXT: vpermps %ymm2, %ymm10, %ymm11 +; AVX2-NEXT: vpermps %ymm1, %ymm10, %ymm10 +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-NEXT: vmovaps {{.*#+}} xmm11 = -; AVX2-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [30064771075,30064771075,30064771075,30064771075] -; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm7 = [30064771075,30064771075,30064771075,30064771075] +; AVX2-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-NEXT: vmovaps {{.*#+}} xmm4 = <3,7,u,u> -; AVX2-NEXT: vpermps %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps %ymm10, (%rsi) -; AVX2-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-NEXT: vmovaps %ymm1, (%r8) +; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX2-NEXT: vmovaps {{.*#+}} xmm5 = <3,7,u,u> +; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-NEXT: vmovaps %ymm0, (%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -372,223 +372,230 @@ define void @load_i32_stride4_vf16(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i32_stride4_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm10 +; SSE-NEXT: subq $40, %rsp +; SSE-NEXT: movaps 208(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm9 +; SSE-NEXT: movaps 80(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm5 ; SSE-NEXT: movaps 112(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm12 -; SSE-NEXT: movaps 176(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] -; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps 128(%rdi), %xmm9 +; SSE-NEXT: movaps 176(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movaps %xmm9, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm9, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movaps 192(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] 
+; SSE-NEXT: movaps 192(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps 48(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movaps %xmm2, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps 32(%rdi), %xmm3 +; SSE-NEXT: movaps 48(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: movaps (%rdi), %xmm10 +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] +; SSE-NEXT: movaps %xmm13, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm7[1] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm12, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = 
xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movaps %xmm12, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm8[1] -; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm15, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movaps %xmm14, (%rsi) +; SSE-NEXT: movaps %xmm3, 48(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rsi) +; SSE-NEXT: movaps %xmm11, (%rsi) ; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps %xmm7, 16(%rdx) -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps %xmm11, 32(%rdx) -; SSE-NEXT: movaps %xmm8, 16(%rcx) -; SSE-NEXT: movaps %xmm3, 48(%rcx) -; SSE-NEXT: movaps %xmm10, 32(%rcx) -; SSE-NEXT: movaps %xmm13, (%rcx) -; SSE-NEXT: movaps %xmm5, 48(%r8) -; SSE-NEXT: movaps %xmm9, 16(%r8) -; SSE-NEXT: movaps %xmm12, 32(%r8) -; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: addq $24, %rsp +; SSE-NEXT: movaps %xmm6, 16(%rdx) +; SSE-NEXT: movaps %xmm13, (%rdx) +; SSE-NEXT: movaps %xmm15, 32(%rdx) +; SSE-NEXT: movaps %xmm7, 16(%rcx) +; SSE-NEXT: movaps %xmm8, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm12, 48(%r8) +; SSE-NEXT: movaps %xmm5, 16(%r8) +; SSE-NEXT: movaps %xmm9, 32(%r8) +; SSE-NEXT: movaps %xmm10, (%r8) +; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i32_stride4_vf16: ; AVX1: # %bb.0: -; AVX1-NEXT: subq $312, %rsp # imm = 0x138 +; AVX1-NEXT: subq $264, %rsp # imm = 0x108 ; AVX1-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX1-NEXT: vmovaps 96(%rdi), %ymm8 +; AVX1-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX1-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX1-NEXT: vmovaps 224(%rdi), %ymm15 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm15[2,3,0,1] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] -; AVX1-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[4],ymm2[4],ymm7[5],ymm2[5] -; AVX1-NEXT: vmovaps %ymm2, %ymm4 -; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX1-NEXT: vmovaps %ymm3, %ymm14 +; AVX1-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovaps %ymm1, %ymm15 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-NEXT: vmovaps %ymm2, %ymm10 +; AVX1-NEXT: vmovaps %ymm1, %ymm3 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = 
ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4] ; AVX1-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 176(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovaps 144(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3,0,1] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] -; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovaps %xmm1, %xmm14 -; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] +; AVX1-NEXT: vmovaps 176(%rdi), %xmm6 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm1[0] +; AVX1-NEXT: vmovaps %xmm6, %xmm2 +; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 144(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 128(%rdi), %xmm6 ; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm12 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[4],ymm15[4],ymm11[5],ymm15[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm7[1,0],ymm4[5,4],ymm7[5,4] -; AVX1-NEXT: vmovaps %ymm7, %ymm11 -; AVX1-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm4[1],zero,zero -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm13 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX1-NEXT: vmovaps %ymm6, %ymm8 +; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm5[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm13 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[4],ymm5[4],ymm9[5],ymm5[5] +; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm13[0,1],ymm0[2,0],ymm13[4,5],ymm0[6,4] +; AVX1-NEXT: vmovaps 32(%rdi), %xmm11 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm12 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0] +; AVX1-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm3[0],ymm8[0],ymm3[1],ymm8[1],ymm3[4],ymm8[4],ymm3[5],ymm8[5] -; AVX1-NEXT: vmovaps %ymm5, %ymm2 -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm1[1,0],ymm5[5,4],ymm1[5,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-NEXT: vmovaps %ymm15, %ymm4 +; AVX1-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vunpcklps {{.*#+}} ymm7 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[4],ymm14[4],ymm15[5],ymm14[5] +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,0],ymm3[1,0],ymm10[5,4],ymm3[5,4] +; AVX1-NEXT: vmovaps %ymm10, %ymm15 +; AVX1-NEXT: vmovaps %ymm3, %ymm13 +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm7[2,3],ymm6[6,4],ymm7[6,7] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm12[1],xmm6[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm14[2],ymm11[3],ymm14[3],ymm11[6],ymm14[6],ymm11[7],ymm14[7] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,0],ymm6[4,5],ymm0[6,4] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; AVX1-NEXT: vmovaps %xmm9, %xmm10 -; AVX1-NEXT: vmovaps %xmm7, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm7[2],xmm9[2] -; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] -; AVX1-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX1-NEXT: vmovaps %ymm2, %ymm9 -; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] +; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[1],xmm1[1],zero,zero +; AVX1-NEXT: 
vmovaps %xmm1, %xmm14 +; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm1[2],xmm5[2] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm8, %ymm10 +; AVX1-NEXT: vunpcklps {{.*#+}} ymm3 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[4],ymm0[4],ymm8[5],ymm0[5] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,0],ymm9[1,0],ymm2[5,4],ymm9[5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm3[2,3],ymm6[6,4],ymm3[6,7] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm7 # 16-byte Folded Reload +; AVX1-NEXT: # xmm7 = mem[0],xmm8[1],zero,zero +; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm14[2],xmm5[3],xmm14[3] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm1[2],xmm3[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps %ymm10, %ymm14 +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] +; AVX1-NEXT: vmovaps %ymm0, %ymm5 +; AVX1-NEXT: vunpckhps {{.*#+}} ymm7 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] +; AVX1-NEXT: vmovaps %xmm11, %xmm10 +; AVX1-NEXT: vmovaps %xmm12, %xmm11 +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm10[2],xmm12[2] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm8[2],xmm12[3],xmm8[3] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] -; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX1-NEXT: # ymm2 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm6[2,3],ymm2[6,4],ymm6[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm11[3,0] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 
# 32-byte Reload +; AVX1-NEXT: vunpckhps (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX1-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0],ymm13[3,0],ymm15[7,4],ymm13[7,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX1-NEXT: # xmm6 = xmm1[3,0],mem[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] -; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX1-NEXT: # ymm6 = ymm9[3,0],mem[3,0],ymm9[7,4],mem[7,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[2,3],ymm6[6,4],ymm4[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,0],xmm12[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm3 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,0],ymm9[3,0],ymm1[7,4],ymm9[7,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,0],xmm12[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -602,95 +609,95 @@ ; AVX1-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-NEXT: vmovaps %ymm2, 32(%r8) ; AVX1-NEXT: vmovaps %ymm1, (%r8) -; AVX1-NEXT: addq $312, %rsp # imm = 0x138 +; AVX1-NEXT: addq $264, %rsp # imm = 0x108 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_i32_stride4_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $104, %rsp -; AVX2-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX2-NEXT: vmovaps 160(%rdi), %ymm8 -; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [17179869184,17179869184,17179869184,17179869184] ; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm1 ; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: 
vmovaps 144(%rdi), %xmm15 -; AVX2-NEXT: vmovaps 128(%rdi), %xmm10 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] -; AVX2-NEXT: vmovaps {{.*#+}} xmm7 = -; AVX2-NEXT: vpermps %ymm8, %ymm7, %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] +; AVX2-NEXT: vmovaps 144(%rdi), %xmm10 +; AVX2-NEXT: vmovaps 128(%rdi), %xmm11 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-NEXT: vmovaps {{.*#+}} xmm9 = +; AVX2-NEXT: vpermps %ymm12, %ymm9, %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm1 ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vpermps %ymm9, %ymm7, %ymm1 -; AVX2-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-NEXT: vmovaps 16(%rdi), %xmm8 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-NEXT: vpermps %ymm7, %ymm9, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-NEXT: vmovaps 16(%rdi), %xmm13 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [21474836481,21474836481,21474836481,21474836481] ; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm1 ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovaps 32(%rdi), %xmm15 ; AVX2-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX2-NEXT: vmovaps {{.*#+}} xmm11 = <1,5,u,u> -; AVX2-NEXT: vpermps %ymm13, %ymm11, %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; AVX2-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u> +; AVX2-NEXT: vpermps %ymm8, %ymm7, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm1 ; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 128(%rdi), %ymm14 -; AVX2-NEXT: vpermps %ymm14, %ymm11, %ymm11 +; AVX2-NEXT: vpermps %ymm14, %ymm7, %ymm7 ; AVX2-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm11 = [25769803778,25769803778,25769803778,25769803778] -; AVX2-NEXT: vpermps %ymm2, %ymm11, %ymm13 -; AVX2-NEXT: vpermps %ymm3, %ymm11, %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm10 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; AVX2-NEXT: vmovaps {{.*#+}} 
xmm13 = -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vpermps %ymm5, %ymm11, %ymm12 -; AVX2-NEXT: vpermps %ymm4, %ymm11, %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm8 = [30064771075,30064771075,30064771075,30064771075] -; AVX2-NEXT: vpermps %ymm5, %ymm8, %ymm5 -; AVX2-NEXT: vpermps %ymm4, %ymm8, %ymm4 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm7 = [25769803778,25769803778,25769803778,25769803778] +; AVX2-NEXT: vpermps %ymm2, %ymm7, %ymm8 +; AVX2-NEXT: vpermps %ymm3, %ymm7, %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-NEXT: vmovaps {{.*#+}} xmm10 = +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpermps %ymm5, %ymm7, %ymm9 +; AVX2-NEXT: vpermps %ymm4, %ymm7, %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm9 = [30064771075,30064771075,30064771075,30064771075] +; AVX2-NEXT: vpermps %ymm5, %ymm9, %ymm5 +; AVX2-NEXT: vpermps %ymm4, %ymm9, %ymm4 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] ; AVX2-NEXT: vmovaps {{.*#+}} xmm6 = <3,7,u,u> -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vpermps %ymm2, %ymm8, %ymm2 -; AVX2-NEXT: vpermps %ymm3, %ymm8, %ymm3 +; AVX2-NEXT: vpermps %ymm2, %ymm9, %ymm2 +; AVX2-NEXT: vpermps %ymm3, %ymm9, %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpermps %ymm14, %ymm6, %ymm1 @@ -704,7 +711,7 @@ ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-NEXT: vmovaps %ymm8, 32(%rcx) ; AVX2-NEXT: vmovaps %ymm7, (%rcx) ; AVX2-NEXT: vmovaps %ymm0, 
32(%r8) ; AVX2-NEXT: vmovaps %ymm4, (%r8) @@ -766,59 +773,60 @@ ; SSE-LABEL: load_i32_stride4_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 -; SSE-NEXT: movaps 336(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm9 -; SSE-NEXT: movaps 352(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm5 +; SSE-NEXT: movaps 336(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm6 +; SSE-NEXT: movaps 368(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm14 -; SSE-NEXT: movaps 80(%rdi), %xmm4 +; SSE-NEXT: movaps 352(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm12 +; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm11 +; SSE-NEXT: movaps 112(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movaps 96(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movaps %xmm8, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movaps %xmm9, %xmm7 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movaps 320(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 496(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 480(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 464(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 448(%rdi), %xmm1 @@ -848,18 +856,18 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 304(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 272(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 432(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 416(%rdi), %xmm10 @@ -894,33 +902,34 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: 
movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm12 +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload @@ -948,18 +957,16 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm14[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm12[1] +; SSE-NEXT: movaps %xmm13, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1] ; SSE-NEXT: movaps %xmm15, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1] +; SSE-NEXT: 
movlhps {{.*#+}} xmm9 = xmm9[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm7[1] ; SSE-NEXT: movaps %xmm8, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] @@ -983,7 +990,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps %xmm3, 96(%rdx) -; SSE-NEXT: movaps %xmm12, 64(%rdx) +; SSE-NEXT: movaps %xmm11, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movaps %xmm4, (%rdx) @@ -997,7 +1004,7 @@ ; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movaps %xmm7, 96(%rcx) ; SSE-NEXT: movaps %xmm9, 64(%rcx) -; SSE-NEXT: movaps %xmm11, 32(%rcx) +; SSE-NEXT: movaps %xmm12, 32(%rcx) ; SSE-NEXT: movaps %xmm6, (%rcx) ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps %xmm1, 80(%rcx) @@ -1007,10 +1014,9 @@ ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps %xmm8, 96(%r8) ; SSE-NEXT: movaps %xmm15, 64(%r8) -; SSE-NEXT: movaps %xmm14, 32(%r8) +; SSE-NEXT: movaps %xmm13, 32(%r8) ; SSE-NEXT: movaps %xmm5, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%r8) +; SSE-NEXT: movaps %xmm14, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1042,7 +1048,7 @@ ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vmovaps 400(%rdi), %xmm5 -; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill ; AVX1-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] @@ -1054,229 +1060,227 @@ ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX1-NEXT: vmovaps %ymm1, %ymm6 ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps %ymm3, %ymm5 +; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vmovaps 272(%rdi), %xmm3 -; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-NEXT: vmovaps 272(%rdi), %xmm4 +; AVX1-NEXT: vmovaps 256(%rdi), %xmm5 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-NEXT: vmovaps %xmm5, %xmm8 +; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; 
AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX1-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX1-NEXT: vmovaps %ymm3, %ymm13 -; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps %ymm1, %ymm12 -; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm3 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] +; AVX1-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm7[2,3,0,1] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] ; AVX1-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps %ymm0, %ymm15 +; AVX1-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-NEXT: vmovaps %ymm1, %ymm13 +; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps %ymm0, %ymm12 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] ; AVX1-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 176(%rdi), %xmm1 -; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovaps 144(%rdi), %xmm4 -; AVX1-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,0] +; AVX1-NEXT: vmovaps %xmm1, %xmm14 +; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 144(%rdi), %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 128(%rdi), %xmm5 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpcklps 
{{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 16(%rdi), %xmm3 ; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[4],ymm10[4],ymm6[5],ymm10[5] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm6[1,0],ymm5[5,4],ymm6[5,4] +; AVX1-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX1-NEXT: # ymm1 = ymm6[1,0],mem[1,0],ymm6[5,4],mem[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[1],xmm5[1],zero,zero -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload -; AVX1-NEXT: # xmm7 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[1],xmm4[1],zero,zero +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vunpcklps {{.*#+}} xmm15 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,0],ymm9[1,0],ymm15[5,4],ymm9[5,4] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm13[1,0],ymm12[5,4],ymm13[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[1],xmm4[1],zero,zero -; AVX1-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX1-NEXT: vunpcklps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload -; AVX1-NEXT: # xmm7 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm5[1],xmm7[1],zero,zero +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm2[0],ymm14[1],ymm2[1],ymm14[4],ymm2[4],ymm14[5],ymm2[5] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm12[1,0],ymm3[5,4],ymm12[5,4] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm2[0],ymm10[1],ymm2[1],ymm10[4],ymm2[4],ymm10[5],ymm2[5] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX1-NEXT: # ymm1 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[1],xmm1[1],zero,zero +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm13[1],xmm1[1],zero,zero ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[1,0],ymm1[5,4],ymm13[5,4] +; AVX1-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] ; AVX1-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-NEXT: # xmm7 = mem[0],xmm7[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX1-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-NEXT: # xmm15 = mem[0],xmm15[1],zero,zero +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX1-NEXT: # ymm0 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX1-NEXT: # ymm1 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX1-NEXT: # ymm0 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[6],ymm6[6],ymm1[7],ymm6[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload -; AVX1-NEXT: # xmm7 = zero,zero,xmm5[2],mem[0] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm4[2],xmm8[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX1-NEXT: # ymm1 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX1-NEXT: # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX1-NEXT: # ymm1 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX1-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm5[2],xmm6[2] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm15 # 16-byte Folded Reload +; AVX1-NEXT: # xmm15 = zero,zero,xmm4[2],mem[0] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[6],ymm3[6],ymm12[7],ymm3[7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm4[2],xmm9[2] -; AVX1-NEXT: vmovaps %xmm9, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX1-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm9[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX1-NEXT: # ymm0 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] -; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX1-NEXT: # ymm1 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm13[2],xmm15[2] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm5[2],xmm12[2] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm15 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-NEXT: vblendps 
{{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm10[3,0],ymm1[7,4],ymm10[7,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm8[3,0],ymm1[7,4],ymm8[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload -; AVX1-NEXT: # xmm4 = xmm11[3,0],mem[3,0] +; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: # xmm1 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm11[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-NEXT: # ymm4 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[2,3],ymm4[6,4],ymm1[6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: # xmm5 = xmm2[3,0],mem[3,0] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm5 # 16-byte Folded Reload +; AVX1-NEXT: # xmm5 = xmm7[3,0],mem[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX1-NEXT: # ymm5 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX1-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-NEXT: # ymm5 = 
ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,0],ymm4[2,3],ymm5[6,4],ymm4[6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX1-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload ; AVX1-NEXT: # xmm6 = xmm2[3,0],mem[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,0],xmm5[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} ymm2 = ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm14[3,0],ymm13[3,0],ymm14[7,4],ymm13[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,0],xmm8[3,0] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm10[3,0],xmm9[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,0],xmm3[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -1297,7 +1301,7 @@ ; AVX1-NEXT: vmovaps %ymm3, 64(%rdx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-NEXT: vmovaps %ymm7, (%rcx) +; AVX1-NEXT: vmovaps %ymm15, (%rcx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm3, 32(%rcx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -1312,7 +1316,7 @@ ; ; AVX2-LABEL: load_i32_stride4_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $696, %rsp # imm = 0x2B8 +; AVX2-NEXT: subq $680, %rsp # imm = 0x2A8 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm12 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm6 @@ -1337,8 +1341,9 @@ ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm6, %ymm3, %ymm0 +; AVX2-NEXT: vmovaps %ymm6, %ymm7 ; AVX2-NEXT: vpermps %ymm5, %ymm3, %ymm1 -; AVX2-NEXT: vmovaps %ymm5, %ymm9 +; AVX2-NEXT: vmovaps %ymm5, %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps 272(%rdi), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1375,7 +1380,7 @@ ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm4 ; AVX2-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 
16-byte Spill ; AVX2-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1383,12 +1388,12 @@ ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836481,21474836481,21474836481,21474836481] -; AVX2-NEXT: vpermps %ymm6, %ymm3, %ymm4 -; AVX2-NEXT: vmovaps %ymm6, %ymm2 +; AVX2-NEXT: vpermps %ymm7, %ymm3, %ymm4 +; AVX2-NEXT: vmovaps %ymm7, %ymm2 +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps %ymm6, %ymm1 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps %ymm9, %ymm1 -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm9, %ymm3, %ymm5 +; AVX2-NEXT: vpermps %ymm6, %ymm3, %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1405,29 +1410,28 @@ ; AVX2-NEXT: vpermps %ymm10, %ymm3, %ymm5 ; AVX2-NEXT: vpermps %ymm8, %ymm3, %ymm6 ; AVX2-NEXT: vmovaps %ymm8, %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 176(%rdi), %xmm6 -; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 176(%rdi), %xmm8 +; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 160(%rdi), %xmm5 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] ; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm13[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm11, %ymm3, %ymm6 ; AVX2-NEXT: vpermps %ymm15, %ymm3, %ymm8 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX2-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-NEXT: vpermps %ymm13, %ymm4, %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm8[2,3] +; AVX2-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; AVX2-NEXT: vpermps %ymm8, %ymm4, %ymm13 +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps %ymm14, %ymm9 @@ -1479,14 +1483,14 @@ ; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX2-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX2-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [30064771075,30064771075,30064771075,30064771075] -; AVX2-NEXT: vpermps %ymm10, %ymm6, %ymm0 -; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm12 = [30064771075,30064771075,30064771075,30064771075] +; AVX2-NEXT: vpermps %ymm10, %ymm12, %ymm0 +; AVX2-NEXT: vpermps %ymm7, %ymm12, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = xmm5[2],mem[2],xmm5[3],mem[3] @@ -1494,24 +1498,23 @@ ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermps %ymm11, %ymm6, %ymm0 -; AVX2-NEXT: vpermps %ymm15, %ymm6, %ymm2 +; AVX2-NEXT: vpermps %ymm11, %ymm12, %ymm0 +; AVX2-NEXT: vpermps %ymm15, %ymm12, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermps %ymm9, %ymm6, %ymm2 -; AVX2-NEXT: vpermps %ymm8, %ymm6, %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vpermps %ymm9, %ymm12, %ymm2 +; AVX2-NEXT: vpermps %ymm8, %ymm12, %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload @@ -1545,7 +1548,7 @@ ; 
AVX2-NEXT: vmovaps %ymm2, 96(%r8) ; AVX2-NEXT: vmovaps %ymm0, (%r8) ; AVX2-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-NEXT: addq $696, %rsp # imm = 0x2B8 +; AVX2-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -144,53 +144,53 @@ ; SSE-LABEL: load_i32_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa 80(%rdi), %xmm1 ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm12 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm10[0],xmm3[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm11[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm12[0],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm1, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm4[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = 
xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm8[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm9, (%rsi) ; SSE-NEXT: movapd %xmm3, (%rdx) -; SSE-NEXT: movapd %xmm4, (%rcx) -; SSE-NEXT: movapd %xmm6, (%r8) -; SSE-NEXT: movapd %xmm2, (%r9) +; SSE-NEXT: movapd %xmm5, (%rcx) +; SSE-NEXT: movapd %xmm10, (%r8) +; SSE-NEXT: movapd %xmm8, (%r9) ; SSE-NEXT: movapd %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -204,10 +204,10 @@ ; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1],xmm3[2,3] ; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,3] ; AVX1-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX1-NEXT: vinsertps {{.*#+}} xmm8 = xmm4[0,1,2],xmm5[2] +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[2] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[1,3] -; AVX1-NEXT: vblendps {{.*#+}} xmm9 = xmm2[0,1,2],xmm5[3] +; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,3] @@ -217,21 +217,21 @@ ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[1] ; AVX1-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm1[2,2,3,3] -; AVX1-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0],xmm7[1],xmm4[2,3] -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm6[2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,2] -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX1-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX1-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] +; AVX1-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1],xmm6[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,0,2] +; AVX1-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = 
xmm8[0,1],xmm1[2,3] ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3] ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] -; AVX1-NEXT: vmovaps %xmm8, (%rsi) -; AVX1-NEXT: vmovaps %xmm9, (%rdx) +; AVX1-NEXT: vmovaps %xmm4, (%rsi) +; AVX1-NEXT: vmovaps %xmm2, (%rdx) ; AVX1-NEXT: vmovaps %xmm3, (%rcx) ; AVX1-NEXT: vmovaps %xmm0, (%r8) -; AVX1-NEXT: vmovaps %xmm2, (%r9) +; AVX1-NEXT: vmovaps %xmm7, (%r9) ; AVX1-NEXT: vmovaps %xmm1, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -246,7 +246,7 @@ ; AVX2-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1,2],xmm5[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <1,7,5,u> ; AVX2-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] @@ -256,28 +256,28 @@ ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm7 -; AVX2-NEXT: vpbroadcastd %xmm7, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX2-NEXT: vpbroadcastd %xmm7, %xmm8 +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,3,3,4,5,7,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,3,3,4,5,7,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3] -; AVX2-NEXT: vpbroadcastd 84(%rdi), %xmm6 -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm7[2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; AVX2-NEXT: vpbroadcastd 84(%rdi), %xmm8 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <4,2,u,u> ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm9, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <5,3,u,u> -; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <5,3,u,u> +; AVX2-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-NEXT: vmovdqa %xmm8, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) ; AVX2-NEXT: vmovdqa %xmm3, (%rdx) -; AVX2-NEXT: vmovdqa %xmm0, (%rcx) +; AVX2-NEXT: vmovdqa %xmm6, (%rcx) ; AVX2-NEXT: vmovdqa %xmm5, (%r8) ; AVX2-NEXT: vmovdqa %xmm2, (%r9) ; AVX2-NEXT: vmovdqa %xmm1, (%rax) @@ -286,52 +286,52 @@ ; ; AVX512-LABEL: load_i32_stride6_vf4: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512-NEXT: vpextrd $2, %xmm0, %eax -; AVX512-NEXT: vpinsrd $1, %eax, %xmm2, %xmm4 -; AVX512-NEXT: vmovd %xmm3, %eax -; AVX512-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4 +; AVX512-NEXT: vpextrd $2, %xmm0, %r10d +; 
AVX512-NEXT: vpinsrd $1, %r10d, %xmm2, %xmm4 +; AVX512-NEXT: vmovd %xmm3, %r10d +; AVX512-NEXT: vpinsrd $2, %r10d, %xmm4, %xmm4 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512-NEXT: vpextrd $2, %xmm5, %eax -; AVX512-NEXT: vpinsrd $3, %eax, %xmm4, %xmm8 -; AVX512-NEXT: vpextrd $1, %xmm3, %eax +; AVX512-NEXT: vpextrd $2, %xmm5, %r10d +; AVX512-NEXT: vpinsrd $3, %r10d, %xmm4, %xmm4 +; AVX512-NEXT: vpextrd $1, %xmm3, %r10d ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm0[2,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; AVX512-NEXT: vpinsrd $2, %eax, %xmm6, %xmm6 +; AVX512-NEXT: vpinsrd $2, %r10d, %xmm6, %xmm6 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm1[0,1],xmm2[2,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,0,2,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3] -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512-NEXT: vmovd %xmm4, %eax -; AVX512-NEXT: vpinsrd $3, %eax, %xmm7, %xmm7 +; AVX512-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX512-NEXT: vmovd %xmm8, %edi +; AVX512-NEXT: vpinsrd $3, %edi, %xmm7, %xmm7 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX512-NEXT: vpextrd $3, %xmm3, %eax -; AVX512-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX512-NEXT: vpextrd $1, %xmm4, %eax -; AVX512-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 -; AVX512-NEXT: vpextrd $2, %xmm1, %eax -; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm3 -; AVX512-NEXT: vmovd %xmm5, %eax -; AVX512-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrd $2, %xmm4, %eax -; AVX512-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrd $1, %xmm5, %eax +; AVX512-NEXT: vpextrd $3, %xmm3, %edi +; AVX512-NEXT: vpinsrd $2, %edi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrd $1, %xmm8, %edi +; AVX512-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrd $2, %xmm1, %edi +; AVX512-NEXT: vpinsrd $1, %edi, %xmm0, %xmm3 +; AVX512-NEXT: vmovd %xmm5, %edi +; AVX512-NEXT: vpinsrd $2, %edi, %xmm3, %xmm3 +; AVX512-NEXT: vpextrd $2, %xmm8, %edi +; AVX512-NEXT: vpinsrd $3, %edi, %xmm3, %xmm3 +; AVX512-NEXT: vpextrd $1, %xmm5, %edi ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] -; AVX512-NEXT: vmovdqa %xmm8, (%rsi) +; AVX512-NEXT: vpinsrd $2, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3] +; AVX512-NEXT: vmovdqa %xmm4, (%rsi) ; AVX512-NEXT: vmovdqa %xmm6, (%rdx) ; AVX512-NEXT: vmovdqa %xmm7, (%rcx) ; AVX512-NEXT: vmovdqa %xmm2, (%r8) ; AVX512-NEXT: vmovdqa %xmm3, (%r9) -; AVX512-NEXT: vmovdqa %xmm0, (%r10) +; AVX512-NEXT: vmovdqa %xmm0, (%rax) ; AVX512-NEXT: retq %wide.vec = load <24 x i32>, ptr %in.vec, align 32 @@ -355,200 +355,198 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa 160(%rdi), %xmm14 -; SSE-NEXT: movdqa 96(%rdi), %xmm11 -; SSE-NEXT: movdqa 112(%rdi), %xmm5 -; SSE-NEXT: movdqa 64(%rdi), %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: movdqa 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movdqa 144(%rdi), %xmm12 +; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm8[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: movdqa 80(%rdi), %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm14[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,3,3] +; 
SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm10[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm9 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm9 = 
xmm5[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 176(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] -; SSE-NEXT: movapd %xmm15, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movapd %xmm9, 16(%rdx) -; SSE-NEXT: movapd %xmm13, (%rdx) -; SSE-NEXT: movapd %xmm5, 16(%rcx) -; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movapd %xmm11, 16(%r8) -; SSE-NEXT: movapd %xmm12, (%r8) +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] +; SSE-NEXT: movapd %xmm10, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movapd %xmm12, 16(%rdx) +; SSE-NEXT: movapd %xmm11, (%rdx) +; SSE-NEXT: movapd %xmm13, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movapd %xmm4, 16(%r8) +; SSE-NEXT: movapd %xmm7, (%r8) ; SSE-NEXT: movapd %xmm0, 16(%r9) -; SSE-NEXT: movapd %xmm1, (%r9) +; SSE-NEXT: movapd %xmm5, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm14, 16(%rax) -; SSE-NEXT: movapd %xmm7, (%rax) +; SSE-NEXT: movapd %xmm9, 16(%rax) +; SSE-NEXT: movapd %xmm6, (%rax) ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i32_stride6_vf8: ; AVX1: # %bb.0: ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX1-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX1-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX1-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX1-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX1-NEXT: vmovaps (%rdi), %ymm7 ; AVX1-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm5 ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm5[0,0],ymm0[6,4],ymm5[4,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm5[2,2],ymm2[6,4],ymm5[6,6] -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm9[4,5],ymm7[6,7] -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm6 -; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm6[2,3] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5],ymm3[6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm10[4,5],ymm4[6,7] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm3[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm11[2,0],ymm3[0,0],ymm11[6,4],ymm3[4,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm9 +; AVX1-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1],xmm9[2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5],ymm10[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm11[2,0],ymm10[0,0],ymm11[6,4],ymm10[4,4] ; AVX1-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] -; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm12[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] ; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,0],ymm5[2,3],ymm12[6,4],ymm5[6,7] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,0],xmm6[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[1,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5],ymm2[6,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4] -; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm4[4,5],ymm10[6,7] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,0],ymm2[2,0],ymm3[4,4],ymm2[6,4] -; 
AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm5 -; AVX1-NEXT: vshufps {{.*#+}} xmm9 = xmm7[2,0],xmm5[2,3] -; AVX1-NEXT: vshufps {{.*#+}} ymm11 = ymm0[2,1],ymm1[2,0],ymm0[6,5],ymm1[6,4] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,0,1] -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3,4],ymm9[5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm6[5,6,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[3,1],ymm3[4,5],ymm2[7,5] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,1],xmm5[3,3] -; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm1[2,1],ymm0[7,5],ymm1[6,5] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4],ymm3[5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7] +; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],xmm9[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[1,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5],ymm8[6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm11[3,0],ymm10[1,0],ymm11[7,4],ymm10[5,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,0],ymm8[2,0],ymm9[4,4],ymm8[6,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm11 +; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,0],xmm11[2,3] +; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm1[2,0],ymm0[6,5],ymm1[6,4] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4],ymm6[5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5,6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[3,1],ymm9[4,5],ymm8[7,5] +; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,1],xmm11[3,3] +; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1],ymm1[2,1],ymm0[7,5],ymm1[6,5] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,0,1] +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4],ymm7[5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,2],ymm5[2,0],ymm4[4,6],ymm5[6,4] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[2,2,3,3] -; AVX1-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm2[0],xmm6[1],xmm2[2,3] -; AVX1-NEXT: vmovapd 80(%rdi), %xmm10 -; AVX1-NEXT: vshufpd {{.*#+}} ymm12 = ymm10[1],ymm1[0],ymm10[2],ymm1[2] -; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm12[2,0],ymm0[4,5],ymm12[6,4] -; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3,4,5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2],ymm8[2,0],ymm4[4,6],ymm8[6,4] +; AVX1-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,2,3,3] +; AVX1-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX1-NEXT: vblendps {{.*#+}} xmm10 = 
xmm11[0],xmm10[1],xmm11[2,3] +; AVX1-NEXT: vmovapd 80(%rdi), %xmm12 +; AVX1-NEXT: vshufpd {{.*#+}} ymm13 = ymm12[1],ymm1[0],ymm12[2],ymm1[2] +; AVX1-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,1],ymm13[2,0],ymm0[4,5],ymm13[6,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7] ; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm4[1,0],ymm3[7,4],ymm4[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,3],ymm3[2,0],ymm4[4,7],ymm3[6,4] -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1],ymm1[1,3],ymm10[7,5],ymm1[5,7] +; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm9[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm1[1,3],ymm12[7,5],ymm1[5,7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[2,0],ymm0[5,5],ymm1[6,4] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX1-NEXT: vmovaps %ymm13, (%rsi) -; AVX1-NEXT: vmovaps %ymm8, (%rdx) -; AVX1-NEXT: vmovaps %ymm9, (%rcx) -; AVX1-NEXT: vmovaps %ymm11, (%r8) -; AVX1-NEXT: vmovaps %ymm5, (%r9) +; AVX1-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-NEXT: vmovaps %ymm5, (%rdx) +; AVX1-NEXT: vmovaps %ymm6, (%rcx) +; AVX1-NEXT: vmovaps %ymm7, (%r8) +; AVX1-NEXT: vmovaps %ymm8, (%r9) ; AVX1-NEXT: vmovaps %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -879,247 +877,242 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $376, %rsp # imm = 0x178 -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa 96(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm10 -; SSE-NEXT: movdqa 240(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm7 -; SSE-NEXT: movdqa 192(%rdi), %xmm0 +; SSE-NEXT: subq $360, %rsp # imm = 0x168 +; SSE-NEXT: movdqa 144(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm4 +; SSE-NEXT: movdqa 96(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm5 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa 256(%rdi), %xmm7 +; SSE-NEXT: movdqa 192(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm14 +; SSE-NEXT: movdqa 64(%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: 
pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm10 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm9 -; SSE-NEXT: movdqa 304(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 304(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa 336(%rdi), %xmm12 +; SSE-NEXT: movdqa 336(%rdi), %xmm11 ; SSE-NEXT: movdqa 352(%rdi), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, 
%xmm15 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa 272(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa 272(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa 224(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa 176(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, %xmm3 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd $85, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movapd %xmm9, %xmm14 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -1127,57 +1120,56 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = 
xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movapd %xmm10, 16(%r8) -; SSE-NEXT: movapd %xmm12, 48(%r8) -; SSE-NEXT: movapd %xmm13, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm3, 48(%r9) -; SSE-NEXT: movapd %xmm5, 16(%r9) -; SSE-NEXT: movapd %xmm6, 32(%r9) -; SSE-NEXT: movapd %xmm8, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movapd %xmm11, 16(%r8) +; SSE-NEXT: movapd %xmm8, 48(%r8) +; SSE-NEXT: movapd %xmm10, 32(%r8) +; SSE-NEXT: movapd %xmm15, (%r8) +; SSE-NEXT: movapd %xmm0, 48(%r9) +; SSE-NEXT: movapd %xmm4, 16(%r9) +; SSE-NEXT: movapd %xmm5, 32(%r9) +; SSE-NEXT: movapd %xmm6, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm14, 16(%rax) -; SSE-NEXT: movapd %xmm7, (%rax) -; SSE-NEXT: movapd %xmm9, 32(%rax) -; SSE-NEXT: movapd %xmm11, 48(%rax) -; SSE-NEXT: addq $376, %rsp # imm = 0x178 +; SSE-NEXT: movapd %xmm13, 16(%rax) +; SSE-NEXT: movapd %xmm9, (%rax) +; SSE-NEXT: 
movapd %xmm14, 32(%rax) +; SSE-NEXT: movapd %xmm3, 48(%rax) +; SSE-NEXT: addq $360, %rsp # imm = 0x168 ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i32_stride6_vf16: ; AVX1: # %bb.0: -; AVX1-NEXT: subq $328, %rsp # imm = 0x148 -; AVX1-NEXT: vmovaps 96(%rdi), %ymm14 -; AVX1-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX1-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: subq $360, %rsp # imm = 0x168 +; AVX1-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX1-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX1-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX1-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 352(%rdi), %ymm3 ; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 224(%rdi), %ymm4 @@ -1203,28 +1195,31 @@ ; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm6, %ymm2 -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm2[0,0],ymm14[6,4],ymm2[4,4] +; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm7, %ymm2 +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm6[2,0],ymm2[0,0],ymm6[6,4],ymm2[4,4] +; AVX1-NEXT: vmovaps %ymm6, %ymm7 ; AVX1-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,0],ymm2[2,2],ymm1[6,4],ymm2[6,6] ; AVX1-NEXT: vmovaps 32(%rdi), %ymm11 ; AVX1-NEXT: vmovaps (%rdi), %ymm8 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm0[2,3] -; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm13[3,4,5],ymm7[6,7] -; AVX1-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX1-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1],xmm0[2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2],ymm13[3,4,5],ymm14[6,7] +; AVX1-NEXT: vmovaps 128(%rdi), %ymm4 ; AVX1-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm4[4,5],ymm12[6,7] ; AVX1-NEXT: vmovaps %ymm12, %ymm13 ; AVX1-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps %ymm4, %ymm14 +; AVX1-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] ; AVX1-NEXT: vshufps {{.*#+}} ymm12 = ymm4[2,0],ymm1[0,0],ymm4[6,4],ymm1[4,4] ; AVX1-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] -; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] -; AVX1-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,0],ymm2[1,0],ymm14[7,4],ymm2[5,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm7[2,0],ymm2[2,3],ymm7[6,4],ymm2[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX1-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,0],ymm2[1,0],ymm7[7,4],ymm2[5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm2[2,3],ymm6[6,4],ymm2[6,7] ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,2],xmm0[1,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5],ymm0[6,7] @@ -1232,8 +1227,8 @@ ; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; 
AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm9[1,0],ymm12[7,4],ymm9[5,4] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,0],ymm9[1,0],ymm6[7,4],ymm9[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,3],ymm0[6,4],ymm9[6,7] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,0],xmm15[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm15[1,3] @@ -1244,15 +1239,14 @@ ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] -; AVX1-NEXT: vmovaps %ymm14, %ymm9 -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm14[2,1],ymm10[2,0],ymm14[6,5],ymm10[6,4] +; AVX1-NEXT: vmovaps %ymm7, %ymm9 +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm7[2,1],ymm15[2,0],ymm7[6,5],ymm15[6,4] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,0],xmm2[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4],ymm3[5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm13[4,5],ymm6[6,7] -; AVX1-NEXT: vmovaps %ymm6, %ymm14 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm3[2,0],ymm4[4,4],ymm3[6,4] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] @@ -1260,105 +1254,106 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX1-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4] +; AVX1-NEXT: vmovaps %ymm6, %ymm10 +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,1],ymm12[2,0],ymm6[6,5],ymm12[6,4] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 ; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm5[2,0],xmm6[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4],ymm7[5,6,7] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-NEXT: vblendps $207, (%rsp), %ymm13, %ymm7 # 32-byte Folded Reload -; AVX1-NEXT: # ymm7 = mem[0,1,2,3],ymm13[4,5],mem[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[2,3,0,1] ; AVX1-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,0],ymm7[2,0],ymm8[4,4],ymm7[6,4] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[3,1],ymm4[4,5],ymm3[7,5] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1],xmm2[3,3] -; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,1],ymm10[2,1],ymm9[7,5],ymm10[6,5] -; AVX1-NEXT: vmovaps %ymm10, %ymm11 +; AVX1-NEXT: vshufps 
{{.*#+}} ymm2 = ymm9[3,1],ymm15[2,1],ymm9[7,5],ymm15[6,5] +; AVX1-NEXT: vmovaps %ymm15, %ymm0 +; AVX1-NEXT: vmovaps %ymm9, %ymm15 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1],ymm7[3,1],ymm8[4,5],ymm7[7,5] ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm5[3,1],xmm6[3,3] -; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,1],ymm15[2,1],ymm12[7,5],ymm15[6,5] -; AVX1-NEXT: vmovaps %ymm12, %ymm10 +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm10[3,1],ymm12[2,1],ymm10[7,5],ymm12[6,5] +; AVX1-NEXT: vmovaps %ymm12, %ymm1 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4],ymm3[5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX1-NEXT: # ymm2 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm14[2,2,3,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX1-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[2,2,3,3] ; AVX1-NEXT: vmovaps 16(%rdi), %xmm5 ; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX1-NEXT: vmovapd 80(%rdi), %xmm6 -; AVX1-NEXT: vshufpd {{.*#+}} ymm7 = ymm6[1],ymm11[0],ymm6[2],ymm11[2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm7 = ymm6[1],ymm0[0],ymm6[2],ymm0[2] ; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,0],ymm9[4,5],ymm7[6,4] -; AVX1-NEXT: vmovaps %ymm9, %ymm12 ; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] ; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,2],ymm8[2,0],ymm7[4,6],ymm8[6,4] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vblendps $12, (%rsp), %ymm13, %ymm4 # 32-byte Folded Reload -; AVX1-NEXT: # ymm4 = ymm13[0,1],mem[2,3],ymm13[4,5,6,7] -; AVX1-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,3,3] -; AVX1-NEXT: vmovaps 208(%rdi), %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX1-NEXT: vmovapd 272(%rdi), %xmm8 -; AVX1-NEXT: vshufpd {{.*#+}} ymm9 = ymm8[1],ymm15[0],ymm8[2],ymm15[2] +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5,6,7] +; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] +; AVX1-NEXT: vmovaps 224(%rdi), %xmm8 +; AVX1-NEXT: vpermilps {{.*#+}} xmm11 = xmm8[2,2,3,3] +; AVX1-NEXT: vmovaps 208(%rdi), %xmm12 +; AVX1-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3] +; AVX1-NEXT: vmovapd 272(%rdi), %xmm14 +; AVX1-NEXT: 
vshufpd {{.*#+}} ymm9 = ymm14[1],ymm1[0],ymm14[2],ymm1[2] ; AVX1-NEXT: vmovaps %ymm10, %ymm13 ; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm4[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm4[2,0],ymm9[0,0],ymm4[6,4],ymm9[4,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,2],ymm10[2,0],ymm9[4,6],ymm10[6,4] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5,6,7] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm4[2,0],ymm11[0,0],ymm4[6,4],ymm11[4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,2],ymm10[2,0],ymm11[4,6],ymm10[6,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm7[1,0],ymm2[7,4],ymm7[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,3],ymm2[2,0],ymm7[4,7],ymm2[6,4] -; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,3] -; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1],ymm11[1,3],ymm6[7,5],ymm11[5,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,1],ymm6[2,0],ymm12[5,5],ymm6[6,4] -; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[1,3,2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0],ymm9[1,0],ymm4[7,4],ymm9[5,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,3],ymm4[2,0],ymm9[4,7],ymm4[6,4] -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,1],ymm15[1,3],ymm8[7,5],ymm15[5,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm13[1,1],ymm3[2,0],ymm13[5,5],ymm3[6,4] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, (%r8) -; AVX1-NEXT: vmovaps %ymm1, 32(%r9) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm1, (%r9) +; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1],ymm0[1,3],ymm6[7,5],ymm0[5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,1],ymm5[2,0],ymm15[5,5],ymm5[6,4] +; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,3,2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm11[1,0],ymm4[7,4],ymm11[5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = 
ymm11[0,3],ymm3[2,0],ymm11[4,7],ymm3[6,4] +; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm8[2,3] +; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm14[3,1],ymm1[1,3],ymm14[7,5],ymm1[5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,1],ymm5[2,0],ymm13[5,5],ymm5[6,4] +; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%r8) +; AVX1-NEXT: vmovaps %ymm9, 32(%r9) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%r9) ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-NEXT: vmovaps %ymm3, 32(%rax) ; AVX1-NEXT: vmovaps %ymm2, (%rax) -; AVX1-NEXT: addq $328, %rsp # imm = 0x148 +; AVX1-NEXT: addq $360, %rsp # imm = 0x168 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll @@ -120,34 +120,34 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind { ; SSE-LABEL: load_i64_stride2_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm6 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps 48(%rdi), %xmm9 -; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdi), %xmm4 ; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm11 +; SSE-NEXT: movaps 112(%rdi), %xmm6 ; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm9[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] 
+; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm3[0] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, 32(%rsi) +; SSE-NEXT: movaps %xmm11, (%rsi) +; SSE-NEXT: movaps %xmm8, 48(%rsi) +; SSE-NEXT: movaps %xmm10, 16(%rsi) ; SSE-NEXT: movaps %xmm5, 32(%rdx) -; SSE-NEXT: movaps %xmm6, (%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm7, 48(%rdx) -; SSE-NEXT: movaps %xmm4, 16(%rdx) +; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i64_stride2_vf8: @@ -216,8 +216,8 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind { ; SSE-LABEL: load_i64_stride2_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm9 -; SSE-NEXT: movaps 32(%rdi), %xmm14 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps 208(%rdi), %xmm10 ; SSE-NEXT: movaps 192(%rdi), %xmm2 @@ -227,18 +227,18 @@ ; SSE-NEXT: movaps 64(%rdi), %xmm6 ; SSE-NEXT: movaps 240(%rdi), %xmm13 ; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps 176(%rdi), %xmm15 +; SSE-NEXT: movaps 176(%rdi), %xmm14 ; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm1 +; SSE-NEXT: movaps 112(%rdi), %xmm15 ; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm15[1] -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] +; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] +; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] ; SSE-NEXT: movaps %xmm2, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm10[0] @@ -249,23 +249,23 @@ ; SSE-NEXT: movaps %xmm6, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] -; SSE-NEXT: movaps %xmm14, %xmm12 +; SSE-NEXT: movaps %xmm1, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm8[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm9, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] ; SSE-NEXT: movaps %xmm13, 96(%rsi) ; SSE-NEXT: movaps %xmm10, 64(%rsi) ; SSE-NEXT: movaps %xmm11, 32(%rsi) -; SSE-NEXT: movaps %xmm14, (%rsi) -; SSE-NEXT: movaps %xmm15, 112(%rsi) -; SSE-NEXT: movaps %xmm1, 80(%rsi) -; SSE-NEXT: movaps %xmm0, 
48(%rsi) +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm14, 112(%rsi) +; SSE-NEXT: movaps %xmm15, 80(%rsi) +; SSE-NEXT: movaps %xmm9, 48(%rsi) ; SSE-NEXT: movaps %xmm12, 16(%rsi) -; SSE-NEXT: movaps %xmm9, (%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm6, 32(%rdx) ; SSE-NEXT: movaps %xmm3, 64(%rdx) ; SSE-NEXT: movaps %xmm2, 96(%rdx) @@ -380,61 +380,61 @@ ; SSE-LABEL: load_i64_stride2_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 272(%rdi), %xmm15 +; SSE-NEXT: movaps 272(%rdi), %xmm8 ; SSE-NEXT: movaps 208(%rdi), %xmm9 -; SSE-NEXT: movaps 192(%rdi), %xmm5 +; SSE-NEXT: movaps 192(%rdi), %xmm2 ; SSE-NEXT: movaps 144(%rdi), %xmm10 -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps 80(%rdi), %xmm13 -; SSE-NEXT: movaps 64(%rdi), %xmm3 -; SSE-NEXT: movaps 304(%rdi), %xmm14 -; SSE-NEXT: movaps 288(%rdi), %xmm12 -; SSE-NEXT: movaps 240(%rdi), %xmm2 -; SSE-NEXT: movaps 224(%rdi), %xmm11 -; SSE-NEXT: movaps 176(%rdi), %xmm7 -; SSE-NEXT: movaps 160(%rdi), %xmm8 -; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 96(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm13[1] +; SSE-NEXT: movaps 128(%rdi), %xmm1 +; SSE-NEXT: movaps 80(%rdi), %xmm12 +; SSE-NEXT: movaps 64(%rdi), %xmm0 +; SSE-NEXT: movaps 304(%rdi), %xmm11 +; SSE-NEXT: movaps 288(%rdi), %xmm6 +; SSE-NEXT: movaps 240(%rdi), %xmm13 +; SSE-NEXT: movaps 224(%rdi), %xmm5 +; SSE-NEXT: movaps 176(%rdi), %xmm15 +; SSE-NEXT: movaps 160(%rdi), %xmm4 +; SSE-NEXT: movaps 112(%rdi), %xmm14 +; SSE-NEXT: movaps 96(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm14[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm15[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; SSE-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm14[0] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps 352(%rdi), %xmm15 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -172,9 +172,9 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i64_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movapd 128(%rdi), %xmm14 -; SSE-NEXT: movapd 176(%rdi), %xmm13 -; SSE-NEXT: movapd 80(%rdi), %xmm12 +; SSE-NEXT: movapd 128(%rdi), %xmm2 +; SSE-NEXT: movapd 176(%rdi), %xmm1 +; SSE-NEXT: movapd 80(%rdi), %xmm0 ; SSE-NEXT: movapd 96(%rdi), %xmm4 ; SSE-NEXT: movapd 112(%rdi), %xmm8 ; SSE-NEXT: movapd 144(%rdi), %xmm3 @@ -184,34 +184,34 @@ ; SSE-NEXT: movapd 32(%rdi), %xmm5 ; SSE-NEXT: movapd 48(%rdi), %xmm7 ; SSE-NEXT: movapd 64(%rdi), %xmm11 -; SSE-NEXT: movapd %xmm11, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm7[0],xmm15[1] -; SSE-NEXT: movapd %xmm9, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: movapd %xmm8, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: movapd %xmm10, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm12[0] -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm13[0] -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm14[0] +; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm7[0],xmm12[1] +; SSE-NEXT: movapd %xmm9, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm3[0],xmm13[1] +; SSE-NEXT: movapd %xmm8, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] +; SSE-NEXT: movapd %xmm10, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm6[0],xmm15[1] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm1[0] +; SSE-NEXT: 
shufpd {{.*#+}} xmm4 = xmm4[1],xmm2[0] ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm5[0] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm9[0],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm8[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm8[0],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1] -; SSE-NEXT: movapd %xmm2, 32(%rsi) -; SSE-NEXT: movapd %xmm0, (%rsi) -; SSE-NEXT: movapd %xmm1, 48(%rsi) -; SSE-NEXT: movapd %xmm15, 16(%rsi) +; SSE-NEXT: movapd %xmm14, 32(%rsi) +; SSE-NEXT: movapd %xmm15, (%rsi) +; SSE-NEXT: movapd %xmm13, 48(%rsi) +; SSE-NEXT: movapd %xmm12, 16(%rsi) ; SSE-NEXT: movapd %xmm4, 32(%rdx) ; SSE-NEXT: movapd %xmm6, (%rdx) ; SSE-NEXT: movapd %xmm3, 48(%rdx) ; SSE-NEXT: movapd %xmm7, 16(%rdx) -; SSE-NEXT: movapd %xmm14, 32(%rcx) +; SSE-NEXT: movapd %xmm2, 32(%rcx) ; SSE-NEXT: movapd %xmm5, (%rcx) -; SSE-NEXT: movapd %xmm13, 48(%rcx) -; SSE-NEXT: movapd %xmm12, 16(%rcx) +; SSE-NEXT: movapd %xmm1, 48(%rcx) +; SSE-NEXT: movapd %xmm0, 16(%rcx) ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i64_stride3_vf8: @@ -327,52 +327,50 @@ ; SSE-LABEL: load_i64_stride3_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movaps 224(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 224(%rdi), %xmm2 ; SSE-NEXT: movapd 128(%rdi), %xmm0 -; SSE-NEXT: movapd 272(%rdi), %xmm6 -; SSE-NEXT: movapd 176(%rdi), %xmm5 +; SSE-NEXT: movapd 272(%rdi), %xmm4 +; SSE-NEXT: movapd 176(%rdi), %xmm3 ; SSE-NEXT: movapd 80(%rdi), %xmm1 -; SSE-NEXT: movapd 192(%rdi), %xmm7 +; SSE-NEXT: movapd 192(%rdi), %xmm5 ; SSE-NEXT: movapd 208(%rdi), %xmm11 -; SSE-NEXT: movapd 96(%rdi), %xmm8 +; SSE-NEXT: movapd 96(%rdi), %xmm6 ; SSE-NEXT: movapd 112(%rdi), %xmm12 -; SSE-NEXT: movapd 240(%rdi), %xmm4 +; SSE-NEXT: movapd 240(%rdi), %xmm7 ; SSE-NEXT: movapd 256(%rdi), %xmm13 -; SSE-NEXT: movapd 144(%rdi), %xmm15 -; SSE-NEXT: movapd 160(%rdi), %xmm9 -; SSE-NEXT: movapd 48(%rdi), %xmm14 -; SSE-NEXT: movapd 64(%rdi), %xmm3 -; SSE-NEXT: movapd %xmm3, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm14[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd 144(%rdi), %xmm10 +; SSE-NEXT: movapd 160(%rdi), %xmm14 +; SSE-NEXT: movapd 48(%rdi), %xmm9 +; SSE-NEXT: movapd 64(%rdi), %xmm15 +; SSE-NEXT: movapd %xmm15, %xmm8 +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm9, %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm15[0],xmm10[1] -; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm5[0] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm9[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm12, %xmm9 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm0[0] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm14, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; SSE-NEXT: movapd %xmm12, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm6[0],xmm14[1] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm0[0] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm13, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm7[0],xmm12[1] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm13[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm13[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm11, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm7[0],xmm13[1] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm0[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm5[0],xmm13[1] +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm2[0] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm11[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 336(%rdi), %xmm11 ; SSE-NEXT: movapd 352(%rdi), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm8 @@ -396,11 +394,11 @@ ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] ; SSE-NEXT: movapd %xmm2, 96(%rsi) ; SSE-NEXT: movapd %xmm13, 64(%rsi) -; SSE-NEXT: movapd %xmm9, 32(%rsi) +; SSE-NEXT: movapd %xmm14, 32(%rsi) ; SSE-NEXT: movapd %xmm5, (%rsi) ; SSE-NEXT: movapd %xmm8, 112(%rsi) ; SSE-NEXT: movapd %xmm12, 80(%rsi) -; SSE-NEXT: movapd %xmm10, 48(%rsi) +; SSE-NEXT: movapd %xmm15, 48(%rsi) ; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movapd %xmm1, 96(%rdx) @@ -412,8 +410,8 @@ ; SSE-NEXT: movapd %xmm11, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rdx) -; SSE-NEXT: movapd %xmm15, 48(%rdx) -; SSE-NEXT: movapd %xmm14, 16(%rdx) +; SSE-NEXT: movapd %xmm10, 48(%rdx) +; SSE-NEXT: movapd %xmm9, 16(%rdx) ; SSE-NEXT: movapd %xmm0, 96(%rcx) ; SSE-NEXT: movapd %xmm6, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -508,15 +506,15 @@ ; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5],ymm1[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm9[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm2 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm7[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} 
ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm4 ; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm5[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] @@ -541,29 +539,29 @@ ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovaps 208(%rdi), %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovaps 16(%rdi), %xmm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vmovaps 304(%rdi), %xmm10 +; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vmovaps 208(%rdi), %xmm11 +; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-NEXT: vmovaps %ymm15, 96(%rsi) -; AVX2-NEXT: vmovaps %ymm14, (%rsi) -; AVX2-NEXT: vmovaps %ymm13, 32(%rsi) +; AVX2-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX2-NEXT: vmovaps %ymm6, 96(%rdx) ; AVX2-NEXT: vmovaps %ymm8, (%rdx) ; AVX2-NEXT: vmovaps %ymm5, 32(%rdx) ; AVX2-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-NEXT: vmovaps %ymm10, 96(%rcx) +; AVX2-NEXT: vmovaps %ymm9, (%rcx) ; AVX2-NEXT: vmovaps %ymm7, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -94,28 +94,28 @@ define void @load_i64_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i64_stride4_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movaps 112(%rdi), %xmm8 +; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 80(%rdi), %xmm1 ; SSE-NEXT: movaps (%rdi), %xmm2 ; SSE-NEXT: movaps 16(%rdi), %xmm3 ; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps 48(%rdi), %xmm9 +; SSE-NEXT: movaps 48(%rdi), %xmm5 ; SSE-NEXT: movaps 96(%rdi), %xmm6 ; SSE-NEXT: movaps 64(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: movaps %xmm2, %xmm9 +; SSE-NEXT: 
movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movaps %xmm5, (%rsi) +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movaps %xmm8, 16(%rsi) +; SSE-NEXT: movaps %xmm9, (%rsi) ; SSE-NEXT: movaps %xmm7, 16(%rdx) ; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps %xmm4, 16(%rcx) @@ -202,181 +202,181 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i64_stride4_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps 112(%rdi), %xmm8 -; SSE-NEXT: movaps 80(%rdi), %xmm14 -; SSE-NEXT: movaps 240(%rdi), %xmm9 -; SSE-NEXT: movaps 208(%rdi), %xmm13 +; SSE-NEXT: movaps 112(%rdi), %xmm5 +; SSE-NEXT: movaps 80(%rdi), %xmm3 +; SSE-NEXT: movaps 240(%rdi), %xmm6 +; SSE-NEXT: movaps 208(%rdi), %xmm2 ; SSE-NEXT: movaps 176(%rdi), %xmm10 -; SSE-NEXT: movaps 144(%rdi), %xmm12 +; SSE-NEXT: movaps 144(%rdi), %xmm1 ; SSE-NEXT: movaps (%rdi), %xmm4 -; SSE-NEXT: movaps 16(%rdi), %xmm11 -; SSE-NEXT: movaps 32(%rdi), %xmm15 -; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps 32(%rdi), %xmm12 +; SSE-NEXT: movaps 224(%rdi), %xmm13 ; SSE-NEXT: movaps 192(%rdi), %xmm7 -; SSE-NEXT: movaps 96(%rdi), %xmm3 -; SSE-NEXT: movaps 64(%rdi), %xmm6 -; SSE-NEXT: movaps 160(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm15[1] -; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm8[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1] -; SSE-NEXT: movaps 48(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm9[1] -; SSE-NEXT: movaps %xmm3, 48(%rsi) -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps 96(%rdi), %xmm14 +; SSE-NEXT: movaps 64(%rdi), %xmm9 +; SSE-NEXT: movaps 160(%rdi), %xmm15 +; SSE-NEXT: movaps 128(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] +; 
SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm15[1] +; SSE-NEXT: movaps %xmm9, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1] +; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] +; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm14, 48(%rsi) +; SSE-NEXT: movaps %xmm15, 16(%rsi) +; SSE-NEXT: movaps %xmm11, 32(%rsi) +; SSE-NEXT: movaps %xmm13, (%rsi) ; SSE-NEXT: movaps %xmm7, 48(%rdx) -; SSE-NEXT: movaps %xmm6, 16(%rdx) +; SSE-NEXT: movaps %xmm9, 16(%rdx) ; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps %xmm5, 32(%rdx) +; SSE-NEXT: movaps %xmm8, 32(%rdx) ; SSE-NEXT: movaps %xmm10, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 48(%rcx) -; SSE-NEXT: movaps %xmm15, 32(%rcx) -; SSE-NEXT: movaps %xmm14, (%rcx) -; SSE-NEXT: movaps %xmm13, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm12, 32(%r8) -; SSE-NEXT: movaps %xmm11, (%r8) +; SSE-NEXT: movaps %xmm5, 48(%rcx) +; SSE-NEXT: movaps %xmm12, 32(%rcx) +; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movaps %xmm2, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movaps %xmm1, 32(%r8) +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i64_stride4_vf8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX1-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX1-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX1-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX1-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX1-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX1-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX1-NEXT: vmovaps 32(%rdi), %xmm6 ; AVX1-NEXT: vmovaps (%rdi), %xmm7 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm6[0] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm0[0] -; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm2[0],xmm1[0] -; AVX1-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm5[0],xmm4[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] +; AVX1-NEXT: vmovaps 160(%rdi), %xmm8 +; AVX1-NEXT: vmovaps 128(%rdi), %xmm9 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm8[0] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1] -; 
AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX1-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX1-NEXT: vmovaps 224(%rdi), %xmm10 +; AVX1-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] +; AVX1-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX1-NEXT: vmovaps 64(%rdi), %xmm14 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm11[1],xmm10[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm13[1] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX1-NEXT: vmovaps 48(%rdi), %xmm11 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm13 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm14 = xmm13[0],xmm11[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX1-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX1-NEXT: vmovaps 144(%rdi), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm1[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm1[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm13[1],xmm11[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vmovaps %xmm15, 16(%rsi) ; AVX1-NEXT: vmovaps %xmm12, 48(%rsi) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm1, 32(%rsi) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-NEXT: vmovaps %xmm2, 16(%rdx) -; AVX1-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX1-NEXT: vmovaps %xmm8, (%rdx) +; AVX1-NEXT: vmovaps %xmm9, 16(%rdx) +; AVX1-NEXT: vmovaps %xmm7, 48(%rdx) +; AVX1-NEXT: vmovaps %xmm6, (%rdx) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX1-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX1-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX1-NEXT: vmovaps %ymm10, (%rcx) ; AVX1-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-NEXT: vmovaps %ymm5, (%r8) +; AVX1-NEXT: vmovaps %ymm4, (%r8) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_i64_stride4_vf8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-NEXT: vmovaps 32(%rdi), %ymm12 -; AVX2-NEXT: vmovaps 
(%rdi), %ymm13 -; AVX2-NEXT: vmovaps 96(%rdi), %ymm14 -; AVX2-NEXT: vmovaps 64(%rdi), %ymm15 +; AVX2-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-NEXT: vmovaps 128(%rdi), %ymm8 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-NEXT: vmovaps 224(%rdi), %xmm6 ; AVX2-NEXT: vmovaps 192(%rdi), %xmm7 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm6[0] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX2-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX2-NEXT: vmovaps (%rdi), %xmm3 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] -; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm2[0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm1[1],xmm0[1] -; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm3[0],xmm4[0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm2[1] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] -; AVX2-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX2-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm9[0] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX2-NEXT: vmovaps (%rdi), %xmm13 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX2-NEXT: vmovaps 64(%rdi), %xmm15 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm15[0],xmm12[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm9[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm13[0],xmm14[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm15[1],xmm12[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3] +; AVX2-NEXT: vmovaps 224(%rdi), %ymm15 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = 
ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX2-NEXT: vmovaps %xmm7, 16(%rsi) -; AVX2-NEXT: vmovaps %xmm6, (%rsi) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm5, 32(%rsi) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm5, 48(%rsi) -; AVX2-NEXT: vmovaps %xmm3, (%rdx) -; AVX2-NEXT: vmovaps %xmm2, 16(%rdx) -; AVX2-NEXT: vmovaps %xmm8, 32(%rdx) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm2, 48(%rdx) +; AVX2-NEXT: vmovaps %xmm10, (%rsi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps %xmm3, 32(%rsi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps %xmm3, 48(%rsi) +; AVX2-NEXT: vmovaps %xmm13, (%rdx) +; AVX2-NEXT: vmovaps %xmm12, 16(%rdx) +; AVX2-NEXT: vmovaps %xmm9, 32(%rdx) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps %xmm3, 48(%rdx) ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-NEXT: vmovaps %ymm14, (%rcx) ; AVX2-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-NEXT: vmovaps %ymm11, (%r8) +; AVX2-NEXT: vmovaps %ymm2, (%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -434,61 +434,61 @@ ; SSE-LABEL: load_i64_stride4_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 416(%rdi), %xmm13 -; SSE-NEXT: movaps 384(%rdi), %xmm7 -; SSE-NEXT: movaps 288(%rdi), %xmm15 -; SSE-NEXT: movaps 256(%rdi), %xmm9 +; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps 416(%rdi), %xmm0 +; SSE-NEXT: movaps 384(%rdi), %xmm6 +; SSE-NEXT: movaps 288(%rdi), %xmm1 +; SSE-NEXT: movaps 256(%rdi), %xmm7 ; SSE-NEXT: movaps 160(%rdi), %xmm2 ; SSE-NEXT: movaps 128(%rdi), %xmm10 ; SSE-NEXT: movaps 480(%rdi), %xmm3 ; SSE-NEXT: movaps 448(%rdi), %xmm11 ; SSE-NEXT: movaps 352(%rdi), %xmm4 ; SSE-NEXT: movaps 320(%rdi), %xmm12 -; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 192(%rdi), %xmm14 -; SSE-NEXT: movaps 96(%rdi), %xmm6 -; SSE-NEXT: movaps 64(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps 192(%rdi), %xmm13 +; SSE-NEXT: movaps 96(%rdi), %xmm9 +; SSE-NEXT: movaps 64(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm9[0] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] +; 
SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm8[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm15 +; SSE-NEXT: movaps %xmm5, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 80(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -586,48 +586,48 @@ ; AVX1-LABEL: load_i64_stride4_vf16: ; AVX1: # %bb.0: ; AVX1-NEXT: subq $296, %rsp # imm = 0x128 -; AVX1-NEXT: vmovaps 224(%rdi), %xmm8 -; AVX1-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm8[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX1-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm10[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill +; AVX1-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 352(%rdi), %xmm4 ; AVX1-NEXT: vmovaps 320(%rdi), %xmm5 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm4[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX1-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm12[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 160(%rdi), %xmm7 +; AVX1-NEXT: vmovaps 128(%rdi), %xmm8 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm7[0] +; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-NEXT: vmovaps 288(%rdi), %xmm6 -; AVX1-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-NEXT: vmovaps 256(%rdi), %xmm11 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] ; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm6[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm11[0],xmm6[0] ; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm11[1],xmm6[1] +; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm10[0],xmm9[0] +; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1] ; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm11[1],xmm10[1] -; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 448(%rdi), %xmm4 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm1[0] -; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX1-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm10[1],xmm9[1] +; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -731,53 +731,53 @@ ; AVX2-LABEL: load_i64_stride4_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $296, %rsp # imm = 0x128 -; 
AVX2-NEXT: vmovaps 224(%rdi), %xmm11 -; AVX2-NEXT: vmovaps 192(%rdi), %xmm15 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm15[0],xmm11[0] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm7[0] +; AVX2-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm0[0] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 96(%rdi), %xmm9 +; AVX2-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 352(%rdi), %xmm4 +; AVX2-NEXT: vmovaps 320(%rdi), %xmm5 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 160(%rdi), %xmm8 +; AVX2-NEXT: vmovaps 128(%rdi), %xmm12 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm8[0] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 288(%rdi), %xmm13 +; AVX2-NEXT: vmovaps 256(%rdi), %xmm14 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0] +; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 416(%rdi), %xmm15 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm13[1] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 384(%rdi), %xmm13 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm0[1] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 352(%rdi), %xmm8 -; AVX2-NEXT: vmovaps 320(%rdi), %xmm6 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm8[0] +; AVX2-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm8[1] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 448(%rdi), %xmm12 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm9[1] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm0[0] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm0[1] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 160(%rdi), %xmm3 -; AVX2-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm3[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm13[0],xmm15[0] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-NEXT: vmovaps 256(%rdi), %xmm5 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm8[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm15[1] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm15[1],xmm11[1] -; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX2-NEXT: vmovaps 480(%rdi), %xmm11 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 448(%rdi), %xmm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm7[1] -; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm11[0] -; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm2[0] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm0[0] -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm0[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm2[1] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -103,54 +103,54 @@ ; SSE-LABEL: load_i64_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps 176(%rdi), %xmm8 -; SSE-NEXT: movaps 128(%rdi), %xmm12 -; SSE-NEXT: movaps 80(%rdi), %xmm9 -; SSE-NEXT: movaps 160(%rdi), %xmm10 -; SSE-NEXT: movaps 112(%rdi), %xmm14 -; SSE-NEXT: movaps 64(%rdi), %xmm11 +; SSE-NEXT: movaps 176(%rdi), %xmm4 +; SSE-NEXT: movaps 128(%rdi), %xmm0 +; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps 64(%rdi), %xmm8 ; SSE-NEXT: movaps (%rdi), %xmm5 ; SSE-NEXT: movaps 16(%rdi), %xmm3 -; SSE-NEXT: movaps 32(%rdi), %xmm13 -; SSE-NEXT: movaps 48(%rdi), %xmm6 -; SSE-NEXT: movaps 144(%rdi), %xmm7 -; SSE-NEXT: movaps 96(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0] -; SSE-NEXT: movaps %xmm14, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm10[1] -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm8[1] -; SSE-NEXT: movaps %xmm15, 16(%rsi) -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps %xmm4, 16(%rdx) +; SSE-NEXT: movaps 32(%rdi), %xmm1 +; SSE-NEXT: movaps 48(%rdi), %xmm9 +; SSE-NEXT: movaps 144(%rdi), %xmm10 +; SSE-NEXT: movaps 96(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: 
movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0] +; SSE-NEXT: movaps %xmm5, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, 16(%rsi) +; SSE-NEXT: movaps %xmm13, (%rsi) +; SSE-NEXT: movaps %xmm11, 16(%rdx) ; SSE-NEXT: movaps %xmm5, (%rdx) -; SSE-NEXT: movaps %xmm7, 16(%rcx) -; SSE-NEXT: movaps %xmm6, (%rcx) -; SSE-NEXT: movaps %xmm14, 16(%r8) +; SSE-NEXT: movaps %xmm10, 16(%rcx) +; SSE-NEXT: movaps %xmm9, (%rcx) +; SSE-NEXT: movaps %xmm2, 16(%r8) ; SSE-NEXT: movaps %xmm3, (%r8) -; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movaps %xmm2, (%r9) -; SSE-NEXT: movaps %xmm12, 16(%rax) -; SSE-NEXT: movaps %xmm13, (%rax) +; SSE-NEXT: movaps %xmm8, 16(%r9) +; SSE-NEXT: movaps %xmm7, (%r9) +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i64_stride6_vf4: ; AVX1: # %bb.0: ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: vmovaps 160(%rdi), %ymm8 +; AVX1-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm3 @@ -158,34 +158,34 @@ ; AVX1-NEXT: vmovaps (%rdi), %xmm5 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm6 ; AVX1-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm0[0] +; AVX1-NEXT: vmovaps 48(%rdi), %xmm8 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm8[0] ; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm0[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm2 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm6[0],xmm5[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm5[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm2 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] -; AVX1-NEXT: vmovaps 80(%rdi), %xmm6 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm6[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm8[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm3 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX1-NEXT: vmovaps 64(%rdi), %xmm8 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm6[0],xmm8[0] ; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; 
AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm8[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm3 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX1-NEXT: vmovaps 80(%rdi), %xmm8 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm8[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm8[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-NEXT: vmovaps %ymm5, (%rcx) ; AVX1-NEXT: vmovaps %ymm1, (%r8) -; AVX1-NEXT: vmovaps %ymm5, (%r9) -; AVX1-NEXT: vmovaps %ymm2, (%rax) +; AVX1-NEXT: vmovaps %ymm6, (%r9) +; AVX1-NEXT: vmovaps %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -202,30 +202,30 @@ ; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm6[0] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 104(%rdi), %ymm8 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 160(%rdi), %ymm6 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] -; AVX2-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm7[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; AVX2-NEXT: vmovaps 64(%rdi), %xmm8 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm8[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm4[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm5[0],xmm4[0] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm7 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm8 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps %ymm9, (%rsi) +; AVX2-NEXT: vmovaps %ymm7, (%rsi) ; AVX2-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-NEXT: vmovaps %ymm6, (%rcx) ; AVX2-NEXT: vmovaps %ymm2, (%r8) @@ -297,63 +297,61 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr 
%out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i64_stride6_vf8: ; SSE: # %bb.0: -; SSE-NEXT: subq $40, %rsp -; SSE-NEXT: movaps 160(%rdi), %xmm8 +; SSE-NEXT: subq $24, %rsp +; SSE-NEXT: movaps 160(%rdi), %xmm9 ; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 352(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm8 ; SSE-NEXT: movaps 256(%rdi), %xmm12 -; SSE-NEXT: movaps 208(%rdi), %xmm9 -; SSE-NEXT: movaps 64(%rdi), %xmm7 -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 16(%rdi), %xmm10 -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps 336(%rdi), %xmm3 -; SSE-NEXT: movaps 288(%rdi), %xmm14 -; SSE-NEXT: movaps 144(%rdi), %xmm4 +; SSE-NEXT: movaps 208(%rdi), %xmm1 +; SSE-NEXT: movaps 64(%rdi), %xmm15 +; SSE-NEXT: movaps (%rdi), %xmm3 +; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rdi), %xmm10 +; SSE-NEXT: movaps 336(%rdi), %xmm14 +; SSE-NEXT: movaps 288(%rdi), %xmm4 +; SSE-NEXT: movaps 144(%rdi), %xmm13 ; SSE-NEXT: movaps 96(%rdi), %xmm5 -; SSE-NEXT: movaps 240(%rdi), %xmm1 -; SSE-NEXT: movaps 192(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movaps 240(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] +; SSE-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-NEXT: movaps %xmm5, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm14[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] -; SSE-NEXT: movaps %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 304(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] ; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movaps 32(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps 272(%rdi), %xmm1 ; SSE-NEXT: movaps 224(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm6 @@ -369,25 +367,25 @@ ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm14, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 48(%rsi) +; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps %xmm14, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps %xmm13, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps %xmm12, 16(%rcx) ; SSE-NEXT: movaps %xmm9, 48(%rcx) -; SSE-NEXT: movaps %xmm11, 32(%rcx) +; SSE-NEXT: movaps %xmm15, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm8, 48(%r8) +; SSE-NEXT: movaps %xmm7, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -402,177 +400,178 @@ ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps %xmm4, 16(%rax) ; SSE-NEXT: movaps %xmm3, 32(%rax) -; SSE-NEXT: movaps %xmm7, (%rax) -; SSE-NEXT: addq $40, %rsp +; SSE-NEXT: movaps %xmm8, (%rax) +; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i64_stride6_vf8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps 352(%rdi), %ymm10 -; AVX1-NEXT: vmovaps 288(%rdi), %ymm12 -; AVX1-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX1-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX1-NEXT: vmovaps 288(%rdi), %ymm4 +; AVX1-NEXT: vmovaps 96(%rdi), %ymm7 ; AVX1-NEXT: vmovaps 128(%rdi), %ymm3 ; 
AVX1-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX1-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm6 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX1-NEXT: vmovaps 240(%rdi), %xmm2 -; AVX1-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm2[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vmovaps 240(%rdi), %xmm8 +; AVX1-NEXT: vmovaps 192(%rdi), %xmm9 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm8[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm10 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX1-NEXT: vmovaps (%rdi), %xmm11 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm12 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm13 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm14 = xmm11[0],xmm13[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm8 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] -; AVX1-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm7[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX1-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm13[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7] ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX1-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm2[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm1 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX1-NEXT: vmovaps 256(%rdi), %xmm3 -; AVX1-NEXT: vmovaps 208(%rdi), %xmm5 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm5[0],xmm3[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-NEXT: vmovaps 160(%rdi), %ymm7 -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm13 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm2[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm2 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] -; AVX1-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm13[0],xmm4[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-NEXT: 
vinsertf128 $1, 320(%rdi), %ymm0, %ymm5 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm8[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm9 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX1-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm12[0],xmm10[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm11 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX1-NEXT: vmovaps 256(%rdi), %xmm13 +; AVX1-NEXT: vmovaps 208(%rdi), %xmm14 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] +; AVX1-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX1-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm12[1],xmm10[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm14[1],xmm13[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm10 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] +; AVX1-NEXT: vmovaps 80(%rdi), %xmm12 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm13 = xmm9[0],xmm12[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm13 +; AVX1-NEXT: vmovaps %ymm0, %ymm3 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] ; AVX1-NEXT: vmovaps 272(%rdi), %xmm1 ; AVX1-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm4[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm10[1],ymm5[3],ymm10[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],ymm3[1],ymm13[3],ymm3[3] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-NEXT: vmovaps %ymm15, 32(%rdx) +; AVX1-NEXT: vmovaps %ymm5, 32(%rdx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX1-NEXT: vmovaps %ymm11, (%rcx) -; AVX1-NEXT: vmovaps %ymm6, 32(%r8) -; AVX1-NEXT: vmovaps %ymm8, (%r8) -; AVX1-NEXT: vmovaps %ymm12, 32(%r9) -; AVX1-NEXT: vmovaps %ymm3, (%r9) +; 
AVX1-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX1-NEXT: vmovaps %ymm6, (%rcx) +; AVX1-NEXT: vmovaps %ymm4, 32(%r8) +; AVX1-NEXT: vmovaps %ymm7, (%r8) +; AVX1-NEXT: vmovaps %ymm2, 32(%r9) +; AVX1-NEXT: vmovaps %ymm11, (%r9) ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-NEXT: vmovaps %ymm2, (%rax) +; AVX1-NEXT: vmovaps %ymm9, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_i64_stride6_vf8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps 352(%rdi), %ymm13 +; AVX2-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm4 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-NEXT: vmovaps 240(%rdi), %xmm6 -; AVX2-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm6[0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 192(%rdi), %xmm8 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm6[0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rdi), %xmm5 -; AVX2-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX2-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm5[0],xmm0[0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 104(%rdi), %ymm8 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm0[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 296(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm3[0],xmm1[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX2-NEXT: vmovaps 256(%rdi), %xmm5 -; AVX2-NEXT: vmovaps 208(%rdi), %xmm6 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm6[0],xmm5[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1] -; AVX2-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX2-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX2-NEXT: vmovaps 48(%rdi), %xmm10 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm5[0],xmm10[0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 104(%rdi), %ymm12 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] +; AVX2-NEXT: 
vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 296(%rdi), %ymm10 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 160(%rdi), %ymm8 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-NEXT: vmovaps 64(%rdi), %xmm12 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm12[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 352(%rdi), %ymm10 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] +; AVX2-NEXT: vmovaps 256(%rdi), %xmm13 +; AVX2-NEXT: vmovaps 208(%rdi), %xmm14 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX2-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm15[1],ymm9[3],ymm15[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm13[1],ymm7[3],ymm13[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovaps 80(%rdi), %xmm5 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm3[0],xmm5[0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovaps 80(%rdi), %xmm11 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm15[0],ymm4[2],ymm15[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX2-NEXT: vmovaps 224(%rdi), %xmm7 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm13[0],ymm2[2],ymm13[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovaps 272(%rdi), %xmm13 +; AVX2-NEXT: vmovaps 224(%rdi), %xmm14 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-NEXT: vmovlhps {{.*#+}} xmm11 = xmm7[0],xmm6[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm2 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm12[1],xmm11[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm11 +; AVX2-NEXT: vbroadcastsd 328(%rdi), %ymm11 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 328(%rdi), %ymm3 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = 
ymm3[1],ymm13[1],ymm3[3],ymm13[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm6[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovaps %ymm12, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-NEXT: vmovaps %ymm15, 32(%rdx) -; AVX2-NEXT: vmovaps %ymm14, (%rdx) +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-NEXT: vmovaps %ymm10, 32(%rcx) ; AVX2-NEXT: vmovaps %ymm8, (%rcx) -; AVX2-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-NEXT: vmovaps %ymm7, 32(%r8) ; AVX2-NEXT: vmovaps %ymm9, (%r8) -; AVX2-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-NEXT: vmovaps %ymm4, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-NEXT: vmovaps %ymm0, (%rax) +; AVX2-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-NEXT: vmovaps %ymm2, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -244,119 +244,119 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i8_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; 
SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: packuswb %xmm2, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: por %xmm8, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm11 +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm8 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm11, %xmm10 +; SSE-NEXT: pand %xmm2, %xmm10 ; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm11, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: pand %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm3 -; SSE-NEXT: por %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,0] +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm12, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm8, (%rsi) ; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: movdqa %xmm4, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride3_vf16: @@ -419,241 +419,239 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i8_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm2 ; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm13, %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: movdqa 48(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm14, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; 
SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: pxor %xmm10, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm9, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm15, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm11 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; 
SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm13[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,3] +; SSE-NEXT: packuswb %xmm11, %xmm0 +; SSE-NEXT: movdqa 80(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pand %xmm15, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,5,6,7,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn 
%xmm3, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: por %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; SSE-NEXT: pand %xmm15, %xmm7 +; SSE-NEXT: por %xmm11, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,6,7,4] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: por %xmm10, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = 
xmm2[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm13 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm13, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: por %xmm2, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; SSE-NEXT: pandn %xmm3, 
%xmm6 +; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; SSE-NEXT: pand %xmm7, %xmm14 +; SSE-NEXT: por %xmm11, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm14, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm13 ; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; 
SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm12, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movdqa %xmm6, 16(%rdx) -; SSE-NEXT: movdqa %xmm10, (%rdx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: movdqa %xmm7, (%rcx) +; SSE-NEXT: movdqa %xmm8, (%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rcx) +; SSE-NEXT: movdqa %xmm11, (%rcx) ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i8_stride3_vf32: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -163,41 +163,41 @@ ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: pshufd 
{{.*#+}} xmm2 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] -; SSE-NEXT: packuswb %xmm7, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: packuswb %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] @@ -282,114 +282,114 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rdi), %xmm7 +; SSE-NEXT: movdqa 48(%rdi), %xmm11 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = 
xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: packuswb %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm1[0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} 
xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; SSE-NEXT: packuswb %xmm8, %xmm14 +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm13[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: packuswb %xmm15, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm14[0,3] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4] ; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm11, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; 
SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: packuswb %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: packuswb %xmm3, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: packuswb %xmm5, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3] +; SSE-NEXT: packuswb %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] ; SSE-NEXT: movdqa %xmm0, (%rsi) -; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps %xmm9, (%rdx) ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm3, (%r8) +; SSE-NEXT: movaps %xmm5, (%r8) ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i8_stride4_vf16: @@ -406,7 +406,7 @@ ; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 @@ -421,22 +421,22 @@ ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm6 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm2 +; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = 
xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm8, (%rsi) +; AVX1-NEXT: vmovdqa %xmm2, (%rsi) ; AVX1-NEXT: vmovdqa %xmm5, (%rdx) -; AVX1-NEXT: vmovdqa %xmm2, (%rcx) +; AVX1-NEXT: vmovdqa %xmm6, (%rcx) ; AVX1-NEXT: vmovdqa %xmm0, (%r8) ; AVX1-NEXT: retq ; @@ -454,7 +454,7 @@ ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 @@ -469,28 +469,28 @@ ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm4 +; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-NEXT: vmovdqa %xmm8, (%rsi) +; AVX2-NEXT: vmovdqa %xmm4, (%rsi) ; AVX2-NEXT: vmovdqa %xmm5, (%rdx) -; AVX2-NEXT: vmovdqa %xmm4, (%rcx) +; AVX2-NEXT: vmovdqa %xmm6, (%rcx) ; AVX2-NEXT: vmovdqa %xmm0, (%r8) ; AVX2-NEXT: retq ; ; AVX512-LABEL: load_i8_stride4_vf16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqu64 (%rdi), %zmm8 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3 @@ -509,22 +509,22 @@ ; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm6 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm0 +; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512-NEXT: vpunpckldq 
{{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX512-NEXT: vpmovdb %zmm8, (%rsi) +; AVX512-NEXT: vpmovdb %zmm0, (%rsi) ; AVX512-NEXT: vmovdqa %xmm5, (%rdx) -; AVX512-NEXT: vmovdqa %xmm0, (%rcx) +; AVX512-NEXT: vmovdqa %xmm6, (%rcx) ; AVX512-NEXT: vmovdqa %xmm1, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -547,129 +547,130 @@ ; SSE-LABEL: load_i8_stride4_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $120, %rsp -; SSE-NEXT: movdqa 64(%rdi), %xmm11 -; SSE-NEXT: movdqa 80(%rdi), %xmm10 +; SSE-NEXT: movdqa 64(%rdi), %xmm4 +; SSE-NEXT: movdqa 80(%rdi), %xmm13 ; SSE-NEXT: movdqa 96(%rdi), %xmm15 -; SSE-NEXT: movdqa 112(%rdi), %xmm13 -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0] ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: packuswb %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: packuswb %xmm1, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm4 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpckhbw 
{{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: packuswb %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = 
xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: packuswb %xmm11, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm7[0,3] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] -; SSE-NEXT: packuswb %xmm0, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm6[0,3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; 
SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: packuswb %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: packuswb %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: movdqa 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: packuswb %xmm1, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm0, %xmm3 @@ -677,212 +678,212 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm14 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm3[0,3] -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshuflw {{.*#+}} 
xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] ; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: packuswb %xmm6, %xmm7 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: packuswb %xmm2, %xmm8 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: 
pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: pshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: packuswb %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm8[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: packuswb %xmm8, %xmm9 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: packuswb %xmm6, %xmm7 -; SSE-NEXT: pshufd $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: packuswb %xmm6, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm7[0,3] -; SSE-NEXT: movdqa %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: packuswb %xmm8, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm9[0,3] +; SSE-NEXT: movdqa %xmm6, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps %xmm4, 16(%rdx) -; SSE-NEXT: movaps %xmm12, (%rdx) +; SSE-NEXT: movaps %xmm5, (%rdx) ; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps %xmm9, (%rcx) -; SSE-NEXT: movaps %xmm5, 16(%r8) +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm7, 16(%r8) ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: addq $120, %rsp ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i8_stride4_vf32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-NEXT: vmovdqa 112(%rdi), %xmm10 -; AVX1-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX1-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm3 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm13 -; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm4 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm6 +; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm6 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 -; AVX1-NEXT: vmovdqa (%rdi), %xmm12 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm14 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX1-NEXT: vpshufb %xmm0, 
%xmm11, %xmm2 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm4 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm2 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm4 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX1-NEXT: vpshufb %xmm1, %xmm11, %xmm3 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm4 -; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm11 +; AVX1-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] +; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm11 +; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm9 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm10 +; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm11 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm12 +; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm13 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-NEXT: vpshufb 
%xmm9, %xmm7, %xmm12 +; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm9 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] +; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm12 +; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm11 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm11 +; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm12 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm13 +; AVX1-NEXT: vpshufb %xmm12, %xmm5, %xmm14 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-NEXT: vpshufb %xmm10, %xmm7, %xmm13 +; AVX1-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm13 +; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm12 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX1-NEXT: vpshufb %xmm11, %xmm6, %xmm5 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX1-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovaps %ymm8, (%rsi) ; AVX1-NEXT: vmovaps %ymm9, (%rdx) -; AVX1-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-NEXT: vmovaps %ymm1, (%r8) +; AVX1-NEXT: vmovaps %ymm10, (%rcx) +; AVX1-NEXT: vmovaps %ymm0, (%r8) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_i8_stride4_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm11 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 @@ -892,70 +893,70 @@ ; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; 
AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm0 +; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %ymm7, %ymm11, %ymm8 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm9 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184] +; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 +; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm8 ; AVX2-NEXT: vpermd %ymm8, %ymm6, %ymm8 -; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm7 -; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm7 -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm1 -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %ymm1, %ymm11, %ymm7 -; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm7 -; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %ymm1, %ymm11, %ymm7 -; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 +; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = 
xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 +; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 +; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9 +; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm10 +; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11 +; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm11 +; AVX2-NEXT: vpermd %ymm11, %ymm6, %ymm11 +; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX2-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm3 -; AVX2-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX2-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm10, (%rsi) +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa %ymm7, (%rsi) ; AVX2-NEXT: vmovdqa %ymm8, (%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -141,60 +141,60 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm4, %xmm4 ; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] ; SSE-NEXT: packuswb %xmm6, %xmm6 ; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: andps %xmm2, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: andps %xmm2, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,3] +; SSE-NEXT: shufps 
{{.*#+}} xmm7 = xmm7[2,0,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movd %xmm9, (%rsi) +; SSE-NEXT: movd %xmm1, (%rsi) ; SSE-NEXT: movd %xmm5, (%rdx) -; SSE-NEXT: movd %xmm1, (%rcx) -; SSE-NEXT: movd %xmm6, (%r8) +; SSE-NEXT: movd %xmm6, (%rcx) +; SSE-NEXT: movd %xmm7, (%r8) ; SSE-NEXT: movd %xmm0, (%r9) ; SSE-NEXT: movd %xmm2, (%rax) ; SSE-NEXT: retq @@ -314,148 +314,148 @@ ; SSE-LABEL: load_i8_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm6, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: 
pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm3 -; SSE-NEXT: por %xmm11, %xmm3 -; SSE-NEXT: pand %xmm3, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: por %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm13, %xmm13 +; SSE-NEXT: 
movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm9, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,4,5,6] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm14, %xmm10 +; SSE-NEXT: por %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm14, %xmm12 +; SSE-NEXT: pandn %xmm13, %xmm14 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,1,0,2] +; SSE-NEXT: packuswb %xmm13, %xmm13 +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: por %xmm14, %xmm12 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm11 +; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,1,0,2] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm6 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} 
xmm1 = xmm1[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movq %xmm10, (%rsi) -; SSE-NEXT: movq %xmm14, (%rdx) -; SSE-NEXT: movq %xmm2, (%rcx) -; SSE-NEXT: movq %xmm7, (%r8) -; SSE-NEXT: movq %xmm5, (%r9) -; SSE-NEXT: movq %xmm6, (%rax) +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: movq %xmm2, (%rsi) +; SSE-NEXT: movq %xmm1, (%rdx) +; SSE-NEXT: movq %xmm10, (%rcx) +; SSE-NEXT: movq %xmm12, (%r8) +; SSE-NEXT: movq %xmm3, (%r9) +; SSE-NEXT: movq %xmm9, (%rax) ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i8_stride6_vf8: @@ -470,13 +470,13 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[4,10,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm8 +; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm9 +; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5 @@ -485,28 +485,28 @@ ; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm8, (%rsi) -; AVX1-NEXT: vmovq %xmm9, (%rdx) +; AVX1-NEXT: vmovq %xmm3, (%rsi) +; AVX1-NEXT: vmovq %xmm4, (%rdx) ; AVX1-NEXT: vmovq %xmm5, (%rcx) -; AVX1-NEXT: vmovq %xmm3, (%r8) -; AVX1-NEXT: vmovq %xmm4, (%r9) +; AVX1-NEXT: vmovq %xmm7, (%r8) +; AVX1-NEXT: vmovq %xmm8, (%r9) ; AVX1-NEXT: vmovq %xmm0, (%rax) ; AVX1-NEXT: retq ; @@ -607,88 +607,86 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa 32(%rdi), %xmm12 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 +; SSE-NEXT: movdqa 32(%rdi), %xmm10 ; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm12, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm10, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pand %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm13, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: por %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm13, %xmm13 +; SSE-NEXT: pxor %xmm9, %xmm9 ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: psrld $16, %xmm0 @@ -696,28 +694,29 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: packuswb %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[1,3,2,0,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: pand %xmm6, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: por %xmm14, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; 
SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] @@ -725,14 +724,14 @@ ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 ; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: por %xmm6, %xmm11 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -740,85 +739,83 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: por %xmm11, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: por %xmm12, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: por %xmm12, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,2] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: 
movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,0,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] @@ -827,9 +824,9 @@ ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] @@ -838,61 +835,62 @@ ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm9, %xmm12 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: pand %xmm12, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm1[2,3] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[2,3] ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] ; SSE-NEXT: packuswb %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 ; SSE-NEXT: packuswb %xmm6, %xmm6 ; SSE-NEXT: pand %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm2, %xmm6 ; SSE-NEXT: movdqa 
%xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,0,0] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,4] +; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: packuswb %xmm2, %xmm1 @@ -900,10 +898,10 @@ ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movdqa %xmm8, (%rdx) -; SSE-NEXT: movdqa %xmm11, (%rcx) +; SSE-NEXT: movdqa %xmm11, (%rdx) +; SSE-NEXT: movdqa %xmm8, (%rcx) ; SSE-NEXT: movdqa %xmm4, (%r8) -; SSE-NEXT: movdqa %xmm3, (%r9) +; SSE-NEXT: movdqa %xmm7, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, (%rax) ; SSE-NEXT: retq @@ -921,86 +919,86 @@ ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[2,8,14,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[4,10] ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX1-NEXT: vpblendvb %xmm10, %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1,2],xmm9[3,4,5],xmm6[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpblendvb %xmm10, %xmm9, %xmm6, %xmm9 -; AVX1-NEXT: vpshufb {{.*#+}} 
xmm6 = xmm3[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm7[0],xmm6[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> -; AVX1-NEXT: vpblendvb %xmm12, %xmm11, %xmm6, %xmm11 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,6,12] -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpblendvb %xmm10, %xmm11, %xmm6, %xmm11 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm6[0],xmm7[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpblendvb %xmm12, %xmm13, %xmm6, %xmm12 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,7,13] -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpblendvb %xmm10, %xmm12, %xmm6, %xmm10 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm12 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> -; AVX1-NEXT: vpblendvb %xmm13, %xmm12, %xmm6, %xmm12 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,8,14] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3,4],xmm6[5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX1-NEXT: vpblendvb %xmm9, %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11] +; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX1-NEXT: vpblendvb %xmm9, %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb 
{{.*#+}} xmm10 = xmm0[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> +; AVX1-NEXT: vpblendvb %xmm11, %xmm8, %xmm10, %xmm8 +; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,6,12] +; AVX1-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-NEXT: vpblendvb %xmm9, %xmm8, %xmm10, %xmm8 +; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm12, %xmm10 +; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,7,13] +; AVX1-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX1-NEXT: vpblendvb %xmm9, %xmm10, %xmm11, %xmm9 +; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> +; AVX1-NEXT: vpblendvb %xmm12, %xmm10, %xmm11, %xmm10 +; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,8,14] +; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero +; AVX1-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-NEXT: vpblendvb %xmm13, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendvb %xmm12, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[3,9,15] ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero ; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX1-NEXT: vmovdqa %xmm8, (%rsi) -; AVX1-NEXT: vmovdqa %xmm9, (%rdx) -; AVX1-NEXT: vmovdqa %xmm11, (%rcx) -; AVX1-NEXT: vmovdqa %xmm10, (%r8) -; AVX1-NEXT: vmovdqa %xmm6, (%r9) +; AVX1-NEXT: vmovdqa %xmm6, (%rsi) +; AVX1-NEXT: vmovdqa %xmm7, (%rdx) +; AVX1-NEXT: vmovdqa %xmm8, (%rcx) +; AVX1-NEXT: vmovdqa %xmm9, (%r8) +; AVX1-NEXT: vmovdqa %xmm10, (%r9) ; AVX1-NEXT: 
vmovdqa %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_i8_stride6_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-NEXT: vpblendvb %ymm0, %ymm8, %ymm4, %ymm5 +; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] @@ -1008,247 +1006,247 @@ ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10] ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero -; AVX2-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %xmm11, %xmm2, %xmm3, %xmm9 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpblendvb %xmm11, %xmm3, %xmm5, %xmm10 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm8, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[u,u,u,u,u] -; AVX2-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12] -; AVX2-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX2-NEXT: vpblendvb %xmm11, %xmm2, %xmm5, %xmm12 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpblendvb %xmm11, %xmm3, %xmm5, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm8, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; 
AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm5[0,6,12],zero,zero,zero,xmm5[4,10,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[4,10],zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero -; AVX2-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3,4],xmm2[5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[1,7,13],zero,zero,zero,xmm5[5,11,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[5,11],zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[2,8,14],zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[u,u,u,u,u] +; AVX2-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12] +; AVX2-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX2-NEXT: vpblendvb %xmm8, %xmm9, %xmm10, %xmm9 +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u] +; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] +; AVX2-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX2-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero +; AVX2-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7] -; AVX2-NEXT: vmovdqa %xmm9, (%rsi) -; AVX2-NEXT: vmovdqa %xmm10, (%rdx) -; AVX2-NEXT: vmovdqa %xmm12, (%rcx) -; 
AVX2-NEXT: vmovdqa %xmm3, (%r8) -; AVX2-NEXT: vmovdqa %xmm2, (%r9) +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] +; AVX2-NEXT: vmovdqa %xmm2, (%rsi) +; AVX2-NEXT: vmovdqa %xmm5, (%rdx) +; AVX2-NEXT: vmovdqa %xmm9, (%rcx) +; AVX2-NEXT: vmovdqa %xmm6, (%r8) +; AVX2-NEXT: vmovdqa %xmm7, (%r9) ; AVX2-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: load_i8_stride6_vf16: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %xmm5 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512-NEXT: vpextrb $2, %xmm2, %eax +; AVX512-NEXT: vpextrb $2, %xmm2, %r10d ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[0,6,12],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $8, %xmm2, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $14, %xmm2, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $4, %xmm1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $10, %xmm1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $6, %xmm0, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrb $12, %xmm0, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm3, %xmm4 +; AVX512-NEXT: vpinsrb $3, %r10d, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $8, %xmm2, %r10d +; AVX512-NEXT: vpinsrb $4, %r10d, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $14, %xmm2, %r10d +; AVX512-NEXT: vpinsrb $5, %r10d, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $4, %xmm1, %r10d +; AVX512-NEXT: vpinsrb $6, %r10d, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $10, %xmm1, %r10d +; AVX512-NEXT: vpinsrb $7, %r10d, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm0, %r10d +; AVX512-NEXT: vpinsrb $8, %r10d, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $6, %xmm0, %r10d +; AVX512-NEXT: vpinsrb $9, %r10d, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $12, %xmm0, %r10d +; AVX512-NEXT: vpinsrb $10, %r10d, %xmm3, %xmm4 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-NEXT: vpextrb $2, %xmm3, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; AVX512-NEXT: vpextrb $8, %xmm3, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX512-NEXT: vpextrb $14, %xmm3, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm4, %xmm6 +; AVX512-NEXT: vpextrb $2, %xmm3, %r10d +; AVX512-NEXT: vpinsrb $11, %r10d, %xmm4, %xmm4 +; AVX512-NEXT: vpextrb $8, %xmm3, %r10d +; AVX512-NEXT: vpinsrb $12, %r10d, %xmm4, %xmm4 +; AVX512-NEXT: vpextrb $14, %xmm3, %r10d +; AVX512-NEXT: vpinsrb $13, %r10d, %xmm4, %xmm6 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512-NEXT: vpextrb $4, %xmm4, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $10, %xmm4, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm6, %xmm8 -; AVX512-NEXT: vpextrb $3, %xmm2, %eax -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,7,13],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $9, %xmm2, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $15, %xmm2, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $5, %xmm1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $11, %xmm1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $1, %xmm0, %eax -; AVX512-NEXT: 
vpinsrb $8, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $7, %xmm0, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $13, %xmm0, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $3, %xmm3, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $9, %xmm3, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $15, %xmm3, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $5, %xmm4, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $11, %xmm4, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm6, %xmm9 -; AVX512-NEXT: vpextrb $4, %xmm2, %eax -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[2,8,14],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $10, %xmm2, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $6, %xmm1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $12, %xmm1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $2, %xmm0, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $8, %xmm0, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $14, %xmm0, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $4, %xmm3, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $10, %xmm3, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vmovd %xmm4, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $6, %xmm4, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $12, %xmm4, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm6, %xmm10 -; AVX512-NEXT: vpextrb $5, %xmm2, %eax -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[3,9,15],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpinsrb $3, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $11, %xmm2, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $1, %xmm1, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $7, %xmm1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $13, %xmm1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $3, %xmm0, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $9, %xmm0, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $15, %xmm0, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $5, %xmm3, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $11, %xmm3, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $1, %xmm4, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $7, %xmm4, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $13, %xmm4, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm7, %xmm7 -; AVX512-NEXT: vpextrb $10, %xmm5, %eax -; AVX512-NEXT: vpextrb $4, %xmm5, %edi -; AVX512-NEXT: vmovd %edi, %xmm6 -; AVX512-NEXT: vpinsrb $1, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vmovd %xmm2, %eax -; AVX512-NEXT: vpinsrb $2, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $6, %xmm2, %eax -; AVX512-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $12, %xmm2, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $2, %xmm1, 
%eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $8, %xmm1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $14, %xmm1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $4, %xmm0, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $10, %xmm0, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vmovd %xmm3, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $6, %xmm3, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $12, %xmm3, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $2, %xmm4, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $8, %xmm4, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $14, %xmm4, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6 -; AVX512-NEXT: vpextrb $11, %xmm5, %eax -; AVX512-NEXT: vpextrb $5, %xmm5, %edi -; AVX512-NEXT: vmovd %edi, %xmm5 -; AVX512-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 -; AVX512-NEXT: vpextrb $1, %xmm2, %eax -; AVX512-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 -; AVX512-NEXT: vpextrb $7, %xmm2, %eax -; AVX512-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 -; AVX512-NEXT: vpextrb $13, %xmm2, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm5, %xmm2 -; AVX512-NEXT: vpextrb $3, %xmm1, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX512-NEXT: vpextrb $9, %xmm1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX512-NEXT: vpextrb $15, %xmm1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm1 -; AVX512-NEXT: vpextrb $5, %xmm0, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vpextrb $11, %xmm0, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm0 -; AVX512-NEXT: vpextrb $1, %xmm3, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $7, %xmm3, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $13, %xmm3, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $3, %xmm4, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $9, %xmm4, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $15, %xmm4, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm8, (%rsi) -; AVX512-NEXT: vmovdqa %xmm9, (%rdx) -; AVX512-NEXT: vmovdqa %xmm10, (%rcx) -; AVX512-NEXT: vmovdqa %xmm7, (%r8) -; AVX512-NEXT: vmovdqa %xmm6, (%r9) -; AVX512-NEXT: vmovdqa %xmm0, (%r10) +; AVX512-NEXT: vpextrb $4, %xmm4, %edi +; AVX512-NEXT: vpinsrb $14, %edi, %xmm6, %xmm6 +; AVX512-NEXT: vpextrb $10, %xmm4, %edi +; AVX512-NEXT: vpinsrb $15, %edi, %xmm6, %xmm6 +; AVX512-NEXT: vpextrb $3, %xmm2, %edi +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[1,7,13],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpinsrb $3, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $9, %xmm2, %edi +; AVX512-NEXT: vpinsrb $4, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $15, %xmm2, %edi +; AVX512-NEXT: vpinsrb $5, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $5, %xmm1, %edi +; AVX512-NEXT: vpinsrb $6, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $11, %xmm1, %edi +; AVX512-NEXT: vpinsrb $7, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $1, %xmm0, %edi +; AVX512-NEXT: vpinsrb $8, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $7, %xmm0, %edi +; AVX512-NEXT: vpinsrb $9, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $13, %xmm0, %edi +; AVX512-NEXT: vpinsrb $10, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $3, %xmm3, %edi +; 
AVX512-NEXT: vpinsrb $11, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $9, %xmm3, %edi +; AVX512-NEXT: vpinsrb $12, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $15, %xmm3, %edi +; AVX512-NEXT: vpinsrb $13, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $5, %xmm4, %edi +; AVX512-NEXT: vpinsrb $14, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $11, %xmm4, %edi +; AVX512-NEXT: vpinsrb $15, %edi, %xmm7, %xmm7 +; AVX512-NEXT: vpextrb $4, %xmm2, %edi +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[2,8,14],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpinsrb $3, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vpextrb $10, %xmm2, %edi +; AVX512-NEXT: vpinsrb $4, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vmovd %xmm1, %edi +; AVX512-NEXT: vpinsrb $5, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vpextrb $6, %xmm1, %edi +; AVX512-NEXT: vpinsrb $6, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vpextrb $12, %xmm1, %edi +; AVX512-NEXT: vpinsrb $7, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vpextrb $2, %xmm0, %edi +; AVX512-NEXT: vpinsrb $8, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vpextrb $8, %xmm0, %edi +; AVX512-NEXT: vpinsrb $9, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vpextrb $14, %xmm0, %edi +; AVX512-NEXT: vpinsrb $10, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vpextrb $4, %xmm3, %edi +; AVX512-NEXT: vpinsrb $11, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vpextrb $10, %xmm3, %edi +; AVX512-NEXT: vpinsrb $12, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vmovd %xmm4, %edi +; AVX512-NEXT: vpinsrb $13, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vpextrb $6, %xmm4, %edi +; AVX512-NEXT: vpinsrb $14, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vpextrb $12, %xmm4, %edi +; AVX512-NEXT: vpinsrb $15, %edi, %xmm8, %xmm8 +; AVX512-NEXT: vpextrb $5, %xmm2, %edi +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[3,9,15],zero,xmm5[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpinsrb $3, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $11, %xmm2, %edi +; AVX512-NEXT: vpinsrb $4, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $1, %xmm1, %edi +; AVX512-NEXT: vpinsrb $5, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $7, %xmm1, %edi +; AVX512-NEXT: vpinsrb $6, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $13, %xmm1, %edi +; AVX512-NEXT: vpinsrb $7, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $3, %xmm0, %edi +; AVX512-NEXT: vpinsrb $8, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $9, %xmm0, %edi +; AVX512-NEXT: vpinsrb $9, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $15, %xmm0, %edi +; AVX512-NEXT: vpinsrb $10, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $5, %xmm3, %edi +; AVX512-NEXT: vpinsrb $11, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $11, %xmm3, %edi +; AVX512-NEXT: vpinsrb $12, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $1, %xmm4, %edi +; AVX512-NEXT: vpinsrb $13, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $7, %xmm4, %edi +; AVX512-NEXT: vpinsrb $14, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $13, %xmm4, %edi +; AVX512-NEXT: vpinsrb $15, %edi, %xmm9, %xmm9 +; AVX512-NEXT: vpextrb $10, %xmm5, %edi +; AVX512-NEXT: vpextrb $4, %xmm5, %r10d +; AVX512-NEXT: vmovd %r10d, %xmm10 +; AVX512-NEXT: vpinsrb $1, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vmovd %xmm2, %edi +; AVX512-NEXT: vpinsrb $2, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrb $6, %xmm2, %edi +; AVX512-NEXT: vpinsrb $3, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrb $12, %xmm2, %edi +; AVX512-NEXT: vpinsrb $4, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrb $2, %xmm1, %edi +; AVX512-NEXT: vpinsrb $5, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrb $8, %xmm1, %edi +; AVX512-NEXT: vpinsrb $6, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrb $14, %xmm1, %edi +; AVX512-NEXT: vpinsrb $7, %edi, %xmm10, %xmm10 +; 
AVX512-NEXT: vpextrb $4, %xmm0, %edi +; AVX512-NEXT: vpinsrb $8, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrb $10, %xmm0, %edi +; AVX512-NEXT: vpinsrb $9, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vmovd %xmm3, %edi +; AVX512-NEXT: vpinsrb $10, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrb $6, %xmm3, %edi +; AVX512-NEXT: vpinsrb $11, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrb $12, %xmm3, %edi +; AVX512-NEXT: vpinsrb $12, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrb $2, %xmm4, %edi +; AVX512-NEXT: vpinsrb $13, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrb $8, %xmm4, %edi +; AVX512-NEXT: vpinsrb $14, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrb $14, %xmm4, %edi +; AVX512-NEXT: vpinsrb $15, %edi, %xmm10, %xmm10 +; AVX512-NEXT: vpextrb $11, %xmm5, %edi +; AVX512-NEXT: vpextrb $5, %xmm5, %r10d +; AVX512-NEXT: vmovd %r10d, %xmm5 +; AVX512-NEXT: vpinsrb $1, %edi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $1, %xmm2, %edi +; AVX512-NEXT: vpinsrb $2, %edi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $7, %xmm2, %edi +; AVX512-NEXT: vpinsrb $3, %edi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $13, %xmm2, %edi +; AVX512-NEXT: vpinsrb $4, %edi, %xmm5, %xmm2 +; AVX512-NEXT: vpextrb $3, %xmm1, %edi +; AVX512-NEXT: vpinsrb $5, %edi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrb $9, %xmm1, %edi +; AVX512-NEXT: vpinsrb $6, %edi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrb $15, %xmm1, %edi +; AVX512-NEXT: vpinsrb $7, %edi, %xmm2, %xmm1 +; AVX512-NEXT: vpextrb $5, %xmm0, %edi +; AVX512-NEXT: vpinsrb $8, %edi, %xmm1, %xmm1 +; AVX512-NEXT: vpextrb $11, %xmm0, %edi +; AVX512-NEXT: vpinsrb $9, %edi, %xmm1, %xmm0 +; AVX512-NEXT: vpextrb $1, %xmm3, %edi +; AVX512-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $7, %xmm3, %edi +; AVX512-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $13, %xmm3, %edi +; AVX512-NEXT: vpinsrb $12, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $3, %xmm4, %edi +; AVX512-NEXT: vpinsrb $13, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $9, %xmm4, %edi +; AVX512-NEXT: vpinsrb $14, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpextrb $15, %xmm4, %edi +; AVX512-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm6, (%rsi) +; AVX512-NEXT: vmovdqa %xmm7, (%rdx) +; AVX512-NEXT: vmovdqa %xmm8, (%rcx) +; AVX512-NEXT: vmovdqa %xmm9, (%r8) +; AVX512-NEXT: vmovdqa %xmm10, (%r9) +; AVX512-NEXT: vmovdqa %xmm0, (%rax) ; AVX512-NEXT: retq %wide.vec = load <96 x i8>, ptr %in.vec, align 32 @@ -1272,143 +1270,145 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $264, %rsp # imm = 0x108 -; SSE-NEXT: movdqa 64(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm14 -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: subq $280, %rsp # imm = 0x118 +; SSE-NEXT: movdqa 64(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm15 +; SSE-NEXT: movdqa 16(%rdi), %xmm13 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} 
xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm15 -; SSE-NEXT: por %xmm5, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: pand %xmm5, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa 128(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm13, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa 160(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm13 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm14 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: 
movdqa 160(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm13, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: movdqa 96(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: pand %xmm11, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] @@ -1416,120 +1416,119 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: por %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm12, 
%xmm2 +; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: pandn %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm10, %xmm10 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,1,0,3] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; SSE-NEXT: packuswb %xmm14, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE-NEXT: packuswb %xmm7, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} 
xmm5 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: packuswb %xmm6, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} 
xmm8 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: packuswb %xmm3, %xmm7 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: packuswb %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm15 -; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: por %xmm14, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -1537,457 +1536,454 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: por (%rsp), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: por 
%xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,3,2,1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 
= xmm0[0,1,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,0,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: por %xmm5, %xmm15 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[3,0] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[3,0] ; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 
= xmm0[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,3,2,1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,0,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: 
pand %xmm9, %xmm13 +; SSE-NEXT: packuswb %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm13, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm13, %xmm15 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: packuswb %xmm3, %xmm2 -; 
SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 ; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[2,3] -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm4 ; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3] -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] -; 
SSE-NEXT: packuswb %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[2,3] +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSE-NEXT: pshufd 
{{.*#+}} xmm8 = xmm8[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: por %xmm8, %xmm11 +; SSE-NEXT: packuswb %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm9, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[2,3] +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm10, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm5 +; SSE-NEXT: por %xmm11, %xmm5 ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] +; SSE-NEXT: 
pshufd {{.*#+}} xmm2 = xmm3[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,4] +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 ; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm14, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm14, 16(%r8) -; SSE-NEXT: movdqa %xmm15, (%r8) -; SSE-NEXT: movdqa %xmm2, 16(%r9) -; SSE-NEXT: movdqa %xmm13, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movdqa %xmm12, (%r8) +; SSE-NEXT: movdqa %xmm4, 16(%r9) +; SSE-NEXT: movdqa %xmm6, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm6, (%rax) -; SSE-NEXT: addq $264, %rsp # imm = 0x108 +; SSE-NEXT: movdqa %xmm8, (%rax) +; SSE-NEXT: addq $280, %rsp # imm = 0x118 ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i8_stride6_vf32: ; AVX1: # %bb.0: ; AVX1-NEXT: subq $168, %rsp -; AVX1-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vmovdqa 144(%rdi), %xmm7 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovdqa 144(%rdi), %xmm5 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] -; AVX1-NEXT: vmovdqa %xmm2, %xmm15 -; AVX1-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero -; AVX1-NEXT: vmovdqa %xmm3, %xmm6 +; AVX1-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[4,10] +; AVX1-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX1-NEXT: vpblendvb %xmm8, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] -; 
AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[5,11] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[5,11] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpblendvb %xmm8, %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm13 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm1, %xmm13, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX1-NEXT: vpshufb %xmm8, %xmm11, %xmm3 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm14 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-NEXT: vpshufb %xmm1, %xmm15, %xmm3 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; AVX1-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[2,8,14],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> -; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm12, %xmm13, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-NEXT: vpshufb %xmm5, %xmm11, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm9[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[3,9,15],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm9 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm8[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[2,8,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm2, %xmm10, %xmm10 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> +; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm10, %xmm3 +; AVX1-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm14, %xmm10 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm12 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm8[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[3,9,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX1-NEXT: vpblendvb %xmm2, %xmm10, %xmm12, %xmm0 ; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm0 -; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm8, %xmm14, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[0,6,12] -; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-NEXT: vpblendvb %xmm3, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[0,6,12] +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm0 -; AVX1-NEXT: vpshufb %xmm5, %xmm14, %xmm1 +; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm0 +; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[1,7,13] -; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[1,7,13] +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[4,10],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm9[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm8[0,6,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> ; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[5,11],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm9[1,7,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm8[1,7,13,u,u,u,u,u,u,u,u,u,u,u] 
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,128,2,8,14,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,2,8,14,u,u,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm1 +; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,6,12,128,128,128,u,u,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm2 @@ -1995,48 +1991,48 @@ ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero -; AVX1-NEXT: vpor %xmm4, %xmm15, %xmm4 +; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX1-NEXT: vpor %xmm4, %xmm13, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX1-NEXT: vpshufb %xmm12, %xmm9, %xmm7 -; AVX1-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX1-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX1-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX1-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4,5],xmm6[6,7] ; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandnps %ymm4, %ymm6, %ymm4 ; AVX1-NEXT: vandps %ymm6, %ymm5, %ymm5 ; AVX1-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm7 # 16-byte Folded Reload -; AVX1-NEXT: vandnps %ymm7, %ymm15, %ymm7 -; AVX1-NEXT: vorps %ymm7, %ymm4, %ymm4 +; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm10 # 16-byte Folded Reload +; AVX1-NEXT: vandnps %ymm10, %ymm5, %ymm10 +; AVX1-NEXT: vorps %ymm4, %ymm10, %ymm4 ; AVX1-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,3,9,15,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <1,7,13,128,128,128,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm12 -; AVX1-NEXT: vpor %xmm7, %xmm12, %xmm12 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] -; AVX1-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX1-NEXT: vpor %xmm7, %xmm14, %xmm7 -; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm12 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; AVX1-NEXT: vpshufb %xmm5, %xmm9, %xmm7 -; AVX1-NEXT: vpshufb %xmm8, %xmm10, %xmm5 -; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] -; AVX1-NEXT: vandnps %ymm12, %ymm6, %ymm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,9,15,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm13 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <1,7,13,128,128,128,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm12 +; AVX1-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] +; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX1-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 +; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX1-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm12[3,4,5],xmm4[6,7] +; AVX1-NEXT: vandnps %ymm11, %ymm6, %ymm7 ; AVX1-NEXT: vandps %ymm6, %ymm4, %ymm4 -; AVX1-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX1-NEXT: vandnps %ymm5, %ymm15, %ymm5 +; AVX1-NEXT: vorps %ymm7, %ymm4, %ymm4 +; AVX1-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload +; AVX1-NEXT: vandnps %ymm6, %ymm5, %ymm5 ; AVX1-NEXT: vorps %ymm5, %ymm4, %ymm4 ; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[4,10,u,u,u,u,u,u,u,u,u,u,u] @@ -2046,87 +2042,87 @@ ; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] ; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandnps %ymm5, %ymm7, %ymm5 -; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX1-NEXT: vorps %ymm5, %ymm6, %ymm6 -; AVX1-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vandps %ymm6, %ymm15, %ymm6 +; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX1-NEXT: vorps %ymm5, %ymm7, %ymm7 +; AVX1-NEXT: vmovaps {{.*#+}} ymm10 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vandps %ymm7, %ymm10, %ymm7 ; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 16-byte Folded Reload -; AVX1-NEXT: vandnps %ymm8, %ymm15, %ymm8 -; AVX1-NEXT: vorps %ymm6, %ymm8, %ymm9 -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm10 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] -; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-NEXT: vandnps %ymm4, %ymm7, %ymm4 -; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX1-NEXT: vorps %ymm4, %ymm6, %ymm4 -; AVX1-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload -; AVX1-NEXT: vandnps %ymm6, %ymm15, %ymm6 -; AVX1-NEXT: vorps %ymm6, %ymm4, %ymm10 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[4,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm7 -; AVX1-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-NEXT: vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload -; AVX1-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-NEXT: vorps %ymm7, %ymm4, %ymm14 +; AVX1-NEXT: vandnps %ymm8, %ymm10, %ymm8 +; AVX1-NEXT: vorps %ymm7, %ymm8, %ymm13 +; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm3[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] +; AVX1-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-NEXT: vandnps %ymm7, %ymm6, %ymm7 +; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-NEXT: vandps %ymm6, %ymm10, %ymm6 +; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; AVX1-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm12 +; AVX1-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded 
Reload +; AVX1-NEXT: vandps %ymm7, %ymm9, %ymm7 +; AVX1-NEXT: vorps %ymm7, %ymm12, %ymm7 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm7 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm12, %xmm8, %xmm6 -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm11 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm12[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm11[5,6,7] -; AVX1-NEXT: vandps %ymm15, %ymm14, %ymm11 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-NEXT: vorps %ymm4, %ymm11, %ymm11 +; AVX1-NEXT: vpshufb %xmm8, %xmm5, %xmm8 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm11 +; AVX1-NEXT: vpor %xmm8, %xmm11, %xmm8 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm15[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3,4],xmm8[5,6,7] +; AVX1-NEXT: vandps %ymm7, %ymm10, %ymm7 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-NEXT: vandnps %ymm8, %ymm10, %ymm8 +; AVX1-NEXT: vorps %ymm7, %ymm8, %ymm7 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX1-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-NEXT: vandps %ymm0, %ymm9, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm1 -; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm2 ; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX1-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vandnps %ymm1, %ymm15, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-NEXT: vmovaps %ymm9, (%rcx) -; AVX1-NEXT: vmovaps %ymm10, (%r8) -; AVX1-NEXT: vmovaps %ymm11, (%r9) +; AVX1-NEXT: vmovaps %ymm13, (%rcx) +; AVX1-NEXT: vmovaps %ymm6, (%r8) +; AVX1-NEXT: vmovaps %ymm7, (%r9) ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: vmovaps %ymm0, (%rax) ; AVX1-NEXT: addq $168, %rsp @@ -2135,117 +2131,114 @@ ; ; AVX2-LABEL: load_i8_stride6_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm14 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-NEXT: vpblendvb %ymm9, %ymm14, %ymm13, %ymm7 +; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm7 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[0,6,12,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm8, %xmm11 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm4[2,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm3[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,10],zero,zero,zero,ymm3[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[20,26] ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm4[0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,8,14],zero,zero,ymm4[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[18,24,30],zero,zero ; AVX2-NEXT: vpor %ymm0, %ymm8, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm8, %ymm11, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,11],zero,zero,zero,ymm3[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[21,27] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,9,15],zero,zero,ymm4[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,25,31],zero,zero -; AVX2-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-NEXT: vpblendvb %ymm0, %ymm13, %ymm14, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm2[2,8,14],zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[u,u,u,u,u] -; AVX2-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm3[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,6,12],zero,zero,zero,ymm3[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,22,28] -; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,10],zero,zero,zero,ymm4[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[20,26],zero,zero,zero -; AVX2-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,ymm3[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,11],zero,zero,zero,ymm3[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[21,27] +; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm4[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,9,15],zero,zero,ymm4[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,25,31],zero,zero +; AVX2-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm10, %ymm7 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm10 +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] +; AVX2-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,ymm3[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,6,12],zero,zero,zero,ymm3[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,22,28] +; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,10],zero,zero,zero,ymm4[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[20,26],zero,zero,zero +; AVX2-NEXT: vpor %ymm14, %ymm15, %ymm14 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm9 -; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[u,u,u,u,u] -; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm3[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,7,13],zero,zero,zero,ymm3[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,23,29] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[5,11],zero,zero,zero,ymm4[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[21,27],zero,zero,zero -; AVX2-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero -; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] -; AVX2-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm9 +; AVX2-NEXT: vpblendvb %ymm8, %ymm13, %ymm14, %ymm13 +; AVX2-NEXT: vpshufb 
{{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] +; AVX2-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,ymm3[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,7,13],zero,zero,zero,ymm3[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,23,29] +; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[5,11],zero,zero,zero,ymm4[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[21,27],zero,zero,zero +; AVX2-NEXT: vpor %ymm11, %ymm14, %ymm11 +; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero +; AVX2-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm8 +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u],zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12] +; AVX2-NEXT: vpor %xmm14, %xmm10, %xmm10 +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm5, %ymm8 -; AVX2-NEXT: vpblendvb %ymm15, %ymm12, %ymm6, %ymm10 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] -; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-NEXT: vpblendvb %ymm0, %ymm12, %ymm6, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[4,10],zero,zero,zero,xmm1[2,8,14] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,2,8,14],zero,zero,xmm0[0,6,12],zero,zero,zero -; AVX2-NEXT: vpor %xmm2, %xmm5, %xmm12 -; AVX2-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm5 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[4,10,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[4,10],zero,zero,zero,xmm5[2,8,14],zero,zero,xmm5[u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,6,12],zero,zero,zero,ymm4[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,22,28],zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,ymm3[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,8,14],zero,zero,ymm3[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,24,30] -; AVX2-NEXT: vpor %ymm7, %ymm13, %ymm7 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm7[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm7 -; AVX2-NEXT: vpblendvb %ymm11, %ymm2, %ymm7, %ymm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[5,11],zero,zero,zero,xmm1[3,9,15] +; AVX2-NEXT: vpblendvb %ymm11, %ymm13, %ymm10, %ymm13 +; AVX2-NEXT: vpblendvb %ymm15, %ymm5, %ymm6, %ymm10 +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm8[u,u,u,u,u],zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13] +; AVX2-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm8, %ymm9 +; AVX2-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[2,8,14] +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,2,8,14],zero,zero,xmm0[0,6,12],zero,zero,zero +; AVX2-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[4,10],zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = ymm4[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,6,12],zero,zero,zero,ymm4[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,22,28],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,ymm3[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,8,14],zero,zero,ymm3[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,24,30] +; AVX2-NEXT: vpor %ymm12, %ymm14, %ymm12 +; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm12[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-NEXT: vpblendvb %ymm11, %ymm8, %ymm6, %ymm6 +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[3,9,15] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,3,9,15],zero,zero,xmm0[1,7,13],zero,zero,zero -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[5,11,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,11],zero,zero,zero,xmm5[3,9,15],zero,zero,xmm5[u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,7,13],zero,zero,zero,ymm4[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[17,23,29],zero,zero,zero +; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,7,13],zero,zero,zero,ymm4[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[17,23,29],zero,zero,zero ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,9,15],zero,zero,ymm3[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,25,31] -; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero -; AVX2-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u],zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[4,10] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero +; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[5,11] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqa %ymm1, (%rsi) -; AVX2-NEXT: vmovdqa %ymm3, (%rdx) -; AVX2-NEXT: vmovdqa %ymm8, (%rcx) +; AVX2-NEXT: vmovdqa %ymm2, (%rdx) +; AVX2-NEXT: vmovdqa %ymm13, (%rcx) ; AVX2-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-NEXT: vmovdqa %ymm6, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -2253,118 +2246,118 @@ ; ; AVX512-LABEL: load_i8_stride6_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm11 -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,8,14],zero,zero,ymm11[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[18,24,30],zero,zero -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm2[2,3],mem[2,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm12[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[4,10],zero,zero,zero,ymm12[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,26] +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm0 +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm0[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,8,14],zero,zero,ymm0[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,24,30],zero,zero +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm2[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,10],zero,zero,zero,ymm2[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[20,26] ; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512-NEXT: movw $18724, %ax # imm = 0x4924 -; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: vpblendmw %ymm13, %ymm10, %ymm7 {%k1} +; AVX512-NEXT: movw $18724, %r10w # imm = 0x4924 +; AVX512-NEXT: kmovd %r10d, %k1 +; AVX512-NEXT: vpblendmw %ymm1, %ymm6, %ymm7 {%k1} ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-NEXT: movl $4192256, %eax # imm = 0x3FF800 -; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512-NEXT: movl $4192256, %r10d # imm = 0x3FF800 +; AVX512-NEXT: kmovd %r10d, %k2 ; AVX512-NEXT: vmovdqu8 %ymm4, %ymm5 {%k2} ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512-NEXT: vpblendmw %ymm9, %ymm4, %ymm6 {%k1} -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u],zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,4,10],zero,zero,zero,xmm6[2,8,14],zero,zero -; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[3,9,15],zero,zero,ymm11[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[19,25,31],zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm12[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[5,11],zero,zero,zero,ymm12[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[21,27] -; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vmovdqu8 %ymm1, %ymm2 {%k2} -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,xmm0[1,7,13],zero,zero,zero,xmm0[5,11] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,5,11],zero,zero,zero,xmm6[3,9,15],zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,10],zero,zero,zero,ymm11[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,26],zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = 
zero,zero,zero,ymm12[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,6,12],zero,zero,zero,ymm12[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,22,28] -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512-NEXT: kmovd %eax, %k3 -; AVX512-NEXT: vpblendmw %ymm10, %ymm13, %ymm1 {%k3} -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm8 -; AVX512-NEXT: movl $2095104, %eax # imm = 0x1FF800 -; AVX512-NEXT: kmovd %eax, %k4 -; AVX512-NEXT: vmovdqu8 %ymm0, %ymm8 {%k4} -; AVX512-NEXT: vpblendmw %ymm4, %ymm9, %ymm0 {%k1} -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,0,6,12],zero,zero,zero,xmm0[4,10],zero,zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12] -; AVX512-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: movl $-2097152, %eax # imm = 0xFFE00000 -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: vmovdqu8 %ymm3, %ymm8 {%k2} -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[5,11],zero,zero,zero,ymm11[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[21,27],zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm12[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[1,7,13],zero,zero,zero,ymm12[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[17,23,29] -; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[3,9,15],zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqu8 %ymm3, %ymm1 {%k4} -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,7,13],zero,zero,zero,xmm0[5,11],zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13] -; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqu8 %ymm0, %ymm1 {%k2} -; AVX512-NEXT: vmovdqu16 %ymm10, %ymm13 {%k1} -; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[4,10],zero,zero,zero,xmm13[2,8,14],zero,zero,xmm13[u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm12[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,8,14],zero,zero,ymm12[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[18,24,30] -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[0,6,12],zero,zero,zero,ymm11[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,22,28],zero,zero,zero -; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vmovdqu16 %ymm9, %ymm4 {%k3} +; AVX512-NEXT: vpblendmw %ymm3, %ymm4, %ymm9 {%k1} +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm10[u,u,u,u,u,u],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[4,10] +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero +; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm11[3,4,5,6,7],ymm5[8,9,10],ymm11[11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[3,9,15],zero,zero,ymm0[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[19,25,31],zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,ymm2[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,11],zero,zero,zero,ymm2[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[21,27] +; AVX512-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512-NEXT: vmovdqu8 %ymm11, %ymm7 {%k2} +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[5,11] +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero +; AVX512-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[2,8,14],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,10],zero,zero,zero,ymm0[18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[20,26],zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm2[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,6,12],zero,zero,zero,ymm2[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,22,28] +; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm9 +; AVX512-NEXT: movw $9362, %di # imm = 0x2492 +; AVX512-NEXT: kmovd %edi, %k3 +; AVX512-NEXT: vpblendmw %ymm6, %ymm1, %ymm10 {%k3} +; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] +; AVX512-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX512-NEXT: movl $2095104, %edi # imm = 0x1FF800 +; AVX512-NEXT: kmovd %edi, %k4 +; AVX512-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4} +; AVX512-NEXT: vpblendmw %ymm4, %ymm3, %ymm9 {%k1} +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u],zero,zero,zero,xmm13[2,8,14],zero,zero,xmm13[0,6,12] +; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 +; AVX512-NEXT: kmovd %edi, %k2 +; AVX512-NEXT: vmovdqu8 %ymm12, %ymm8 {%k2} +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[3,9,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,11],zero,zero,zero,ymm0[19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[21,27],zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = 
zero,zero,zero,ymm2[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,7,13],zero,zero,zero,ymm2[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[17,23,29] +; AVX512-NEXT: vpor %ymm12, %ymm14, %ymm12 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] +; AVX512-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512-NEXT: vmovdqu8 %ymm12, %ymm10 {%k4} +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u],zero,zero,zero,xmm13[3,9,15],zero,zero,xmm13[1,7,13] +; AVX512-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512-NEXT: vmovdqu8 %ymm9, %ymm10 {%k2} +; AVX512-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[4,10,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[4,10],zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,ymm2[0,6,12],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14],zero,zero,ymm2[16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[18,24,30] +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[4,10],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,6,12],zero,zero,zero,ymm0[20,26],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,22,28],zero,zero,zero +; AVX512-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vmovdqu16 %ymm3, %ymm4 {%k3} ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u],zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[2,8,14] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero -; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vmovdqu8 %ymm5, %ymm2 {%k2} -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,7,13],zero,zero,zero,xmm0[5,11,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[5,11],zero,zero,zero,xmm13[3,9,15],zero,zero,xmm13[u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm12[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[3,9,15],zero,zero,ymm12[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[19,25,31] -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm11[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[1,7,13],zero,zero,zero,ymm11[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[17,23,29],zero,zero,zero -; AVX512-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[3,9,15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero -; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vmovdqu8 %ymm3, %ymm0 {%k2} -; AVX512-NEXT: vmovdqa %ymm14, (%rsi) +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm3[u,u,u,u,u],zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[2,8,14] +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero +; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512-NEXT: vmovdqu8 %ymm11, %ymm9 {%k2} +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[5,11,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[1,7,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15],zero,zero,ymm2[17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,25,31] +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[5,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[1,7,13],zero,zero,zero,ymm0[21,27],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[17,23,29],zero,zero,zero +; AVX512-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u],zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[3,9,15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero +; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} +; AVX512-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512-NEXT: vmovdqa %ymm7, (%rdx) ; AVX512-NEXT: vmovdqa %ymm8, (%rcx) -; AVX512-NEXT: vmovdqa %ymm1, (%r8) -; AVX512-NEXT: vmovdqa %ymm2, (%r9) -; AVX512-NEXT: vmovdqa %ymm0, (%r10) +; AVX512-NEXT: vmovdqa %ymm10, (%r8) +; AVX512-NEXT: vmovdqa %ymm9, (%r9) +; AVX512-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <192 x i8>, ptr %in.vec, align 32 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll @@ -192,9 +192,9 @@ ; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rsi), %xmm5 ; SSE-NEXT: movdqa 32(%rsi), %xmm6 -; SSE-NEXT: movdqa 48(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: movdqa 48(%rsi), %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] @@ -203,8 +203,8 @@ ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] ; SSE-NEXT: movdqa %xmm3, 96(%rdx) ; SSE-NEXT: movdqa %xmm6, 112(%rdx) ; SSE-NEXT: movdqa %xmm2, 64(%rdx) @@ -212,7 +212,7 @@ ; SSE-NEXT: movdqa %xmm1, 32(%rdx) ; SSE-NEXT: movdqa %xmm4, 48(%rdx) ; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm7, 16(%rdx) +; SSE-NEXT: movdqa %xmm8, 16(%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vf32: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -308,77 +308,77 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa (%rsi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rsi), %xmm2 ; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa (%rdx), %xmm9 -; SSE-NEXT: movdqa 16(%rdx), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: por %xmm3, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] ; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,2,3,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm4, 32(%rcx) -; SSE-NEXT: movdqa %xmm0, 80(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: pandn %xmm10, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,2,2] +; SSE-NEXT: pandn %xmm10, %xmm3 +; SSE-NEXT: por %xmm12, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm11, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, 32(%rcx) +; SSE-NEXT: movdqa %xmm6, 80(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: movdqa %xmm3, 16(%rcx) -; SSE-NEXT: movdqa %xmm1, 48(%rcx) +; SSE-NEXT: movdqa %xmm8, 48(%rcx) ; 
SSE-NEXT: movdqa %xmm5, 64(%rcx) ; SSE-NEXT: retq ; @@ -395,25 +395,25 @@ ; AVX1-NEXT: vmovdqa (%rdx), %xmm5 ; AVX1-NEXT: vmovdqa 16(%rdx), %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,2] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2],xmm7[3,4],xmm2[5],xmm7[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1,2],xmm8[3],xmm7[4,5],xmm8[6],xmm7[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm2[1],xmm7[2,3],xmm2[4],xmm7[5,6],xmm2[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,0,0,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2],xmm7[3,4],xmm2[5],xmm7[6,7] +; AVX1-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX1-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,0,0,0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,0,0,0] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] @@ -422,10 +422,10 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7] ; AVX1-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX1-NEXT: vmovdqa %xmm1, 48(%rcx) -; AVX1-NEXT: vmovdqa %xmm2, (%rcx) -; AVX1-NEXT: vmovdqa %xmm11, 80(%rcx) -; AVX1-NEXT: vmovdqa %xmm10, 16(%rcx) -; AVX1-NEXT: vmovdqa %xmm8, 64(%rcx) +; AVX1-NEXT: vmovdqa %xmm10, (%rcx) +; AVX1-NEXT: vmovdqa %xmm8, 80(%rcx) +; AVX1-NEXT: vmovdqa %xmm7, 16(%rcx) +; AVX1-NEXT: vmovdqa %xmm2, 64(%rcx) ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: vf16: @@ -475,7 +475,7 @@ ; ; AVX2-FAST-ALL-LABEL: vf16: ; 
AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm8 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rsi), %xmm3 @@ -483,43 +483,43 @@ ; AVX2-FAST-ALL-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3,4],xmm5[5],xmm1[6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] ; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm3, %ymm3 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm4, %xmm2, %xmm3 -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,2] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm3, %ymm3 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-ALL-NEXT: vpermd (%rdi), %ymm3, %ymm3 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-ALL-NEXT: vpermd (%rdi), %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; 
AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, (%rcx) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 64(%rcx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, 64(%rcx) ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3 @@ -527,37 +527,37 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3,4],xmm5[5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm3, %ymm3 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -591,98 +591,101 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: vf32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa 32(%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm4 ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa 32(%rsi), %xmm6 -; SSE-NEXT: movdqa 48(%rsi), %xmm10 -; SSE-NEXT: movdqa 32(%rdx), %xmm9 -; SSE-NEXT: movdqa 48(%rdx), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rsi), %xmm8 +; SSE-NEXT: movdqa 48(%rsi), %xmm11 +; SSE-NEXT: movdqa 32(%rdx), %xmm10 +; SSE-NEXT: movdqa 48(%rdx), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] -; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm0, %xmm9 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} 
xmm1 = xmm4[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm5, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] ; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm4, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm5, %xmm13 +; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%rdx), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] +; SSE-NEXT: 
movdqa 16(%rdx), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] ; SSE-NEXT: movdqa %xmm2, %xmm14 ; SSE-NEXT: pandn %xmm1, %xmm14 ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa (%rsi), %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm5, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,2,2] +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] ; SSE-NEXT: pand %xmm5, %xmm3 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 @@ -690,154 +693,154 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,2,2] ; SSE-NEXT: pandn %xmm12, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0] -; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,0,0] +; SSE-NEXT: pandn %xmm12, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm10 -; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; 
SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,1,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,1,0,4,5,6,7] ; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] ; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm11, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] ; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: por %xmm4, %xmm12 ; SSE-NEXT: movdqa %xmm12, 32(%rcx) -; SSE-NEXT: movdqa %xmm6, 80(%rcx) +; SSE-NEXT: movdqa %xmm1, 80(%rcx) ; SSE-NEXT: movdqa %xmm0, 128(%rcx) -; SSE-NEXT: movdqa %xmm10, 176(%rcx) +; SSE-NEXT: movdqa %xmm11, 176(%rcx) ; SSE-NEXT: movdqa %xmm5, (%rcx) ; SSE-NEXT: movdqa %xmm2, 16(%rcx) -; SSE-NEXT: movdqa %xmm4, 48(%rcx) +; SSE-NEXT: movdqa %xmm15, 48(%rcx) ; SSE-NEXT: movdqa %xmm14, 64(%rcx) -; SSE-NEXT: movdqa %xmm9, 96(%rcx) -; SSE-NEXT: movdqa %xmm13, 112(%rcx) -; SSE-NEXT: movdqa %xmm15, 144(%rcx) +; SSE-NEXT: movdqa %xmm13, 96(%rcx) +; SSE-NEXT: movdqa %xmm10, 112(%rcx) +; SSE-NEXT: movdqa %xmm9, 144(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rcx) ; SSE-NEXT: retq ; ; AVX1-LABEL: vf32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm12 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm15 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] -; AVX1-NEXT: vmovdqa (%rsi), %xmm13 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-NEXT: 
vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] -; AVX1-NEXT: vmovdqa 16(%rdx), %xmm11 -; AVX1-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX1-NEXT: vmovdqa (%rdi), %xmm14 +; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] +; AVX1-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX1-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] +; AVX1-NEXT: vmovdqa 16(%rdx), %xmm8 +; AVX1-NEXT: vmovdqa 32(%rdx), %xmm13 ; AVX1-NEXT: vmovdqa 48(%rdx), %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2],xmm7[3],xmm3[4,5],xmm7[6],xmm3[7] -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3,4],xmm7[5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2],xmm7[3],xmm3[4,5],xmm7[6],xmm3[7] -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3,4],xmm7[5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[1,1,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2],xmm7[3],xmm3[4,5],xmm7[6],xmm3[7] -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[1,1,2,2] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm7[2],xmm3[3,4],xmm7[5],xmm3[6,7] -; AVX1-NEXT: vmovdqa (%rdx), %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7] -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX1-NEXT: vpshufb %xmm10, %xmm7, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3],xmm14[4],xmm5[5,6],xmm14[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} 
xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6],xmm7[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX1-NEXT: vpshufb %xmm10, %xmm7, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] -; AVX1-NEXT: vmovdqa %xmm3, (%rcx) -; AVX1-NEXT: vmovdqa %xmm6, 32(%rcx) -; AVX1-NEXT: vmovdqa %xmm4, 48(%rcx) -; AVX1-NEXT: vmovdqa %xmm2, 80(%rcx) -; AVX1-NEXT: vmovdqa %xmm1, 96(%rcx) +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,2] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm11[2],xmm6[3,4],xmm11[5],xmm6[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,1,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm6[1,2],xmm11[3],xmm6[4,5],xmm11[6],xmm6[7] +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[1,1,2,2] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3,4],xmm14[5],xmm11[6,7] +; AVX1-NEXT: vmovdqa (%rdx), %xmm15 +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,1,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm11[1,2],xmm14[3],xmm11[4,5],xmm14[6],xmm11[7] +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = 
[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm4[2],xmm14[3,4],xmm4[5],xmm14[6,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6],xmm11[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX1-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,0,0,0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6],xmm10[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[0,0,0,0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6],xmm5[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[0,0,0,0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] +; AVX1-NEXT: vmovdqa %xmm2, (%rcx) +; AVX1-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX1-NEXT: vmovdqa %xmm3, 48(%rcx) +; AVX1-NEXT: vmovdqa %xmm9, 80(%rcx) +; AVX1-NEXT: vmovdqa %xmm7, 96(%rcx) ; AVX1-NEXT: vmovdqa %xmm0, 176(%rcx) -; AVX1-NEXT: vmovdqa %xmm5, 128(%rcx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm0, 144(%rcx) +; AVX1-NEXT: vmovdqa %xmm4, 128(%rcx) +; AVX1-NEXT: vmovdqa %xmm14, 144(%rcx) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 16(%rcx) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -850,81 +853,81 @@ ; ; AVX2-SLOW-LABEL: vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm5 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = 
xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <5,5,u,6,6,u,7,7> -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm13, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7> +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm5, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3,4],xmm7[5],xmm1[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpermd %ymm14, %ymm13, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-SLOW-NEXT: vpermd %ymm14, %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3,4],xmm1[5],xmm7[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,u,3,3,u,4,4,u> -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpermd %ymm14, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm7, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 
{{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm9, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,3,3,u,4,4,u> +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 128(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm12, 64(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 160(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 96(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -940,68 +943,68 @@ ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm11, %ymm7 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm4, %ymm7 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm9 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 ; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rsi), %xmm1 -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa 48(%rsi), %xmm9 +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm9, %xmm9 ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3,4],xmm1[5],xmm7[6,7] -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm3, %xmm5 -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7] +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm3, %xmm9 +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %ymm10 ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm11, %ymm10 -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm3, %ymm10, %ymm11 -; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm5, %ymm8 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm9, %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm11, %xmm7, %xmm7 ; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm5, %ymm3 -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm6, %ymm4 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-ALL-NEXT: vpermd 32(%rdi), %ymm5, %ymm6 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm6, %ymm3 -; AVX2-FAST-ALL-NEXT: vpermd (%rdi), %ymm5, %ymm5 -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm5, %ymm6 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm9, %ymm5 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-ALL-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-ALL-NEXT: vpermd 32(%rdi), %ymm7, %ymm8 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm8 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm5, %ymm8, %ymm5 +; AVX2-FAST-ALL-NEXT: vpermd (%rdi), %ymm7, %ymm7 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm7, %ymm8 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm9, %ymm5, %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, 128(%rcx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm6, 128(%rcx) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, 64(%rcx) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm11, 96(%rcx) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 160(%rcx) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, 96(%rcx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, 160(%rcx) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: retq ; @@ -1017,68 +1020,68 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: 
vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3,4],xmm1[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm10, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm5, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm9, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = 
xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm9, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; 
AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -276,148 +276,148 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm6 +; SSE-NEXT: movdqa (%rdx), %xmm7 ; SSE-NEXT: movdqa 16(%rdx), %xmm4 -; SSE-NEXT: movdqa (%rcx), %xmm6 +; SSE-NEXT: movdqa (%rcx), %xmm8 ; SSE-NEXT: movdqa 16(%rcx), %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = 
xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movdqa %xmm1, 96(%r8) -; SSE-NEXT: movdqa %xmm3, 112(%r8) -; SSE-NEXT: movdqa %xmm6, 64(%r8) -; SSE-NEXT: movdqa %xmm7, 80(%r8) +; SSE-NEXT: movdqa %xmm6, 112(%r8) +; SSE-NEXT: movdqa %xmm8, 64(%r8) +; SSE-NEXT: movdqa %xmm10, 80(%r8) ; SSE-NEXT: movdqa %xmm0, 32(%r8) ; SSE-NEXT: movdqa %xmm5, 48(%r8) ; SSE-NEXT: movdqa %xmm2, (%r8) -; SSE-NEXT: movdqa %xmm10, 16(%r8) +; SSE-NEXT: movdqa %xmm3, 16(%r8) ; SSE-NEXT: retq ; ; AVX1-LABEL: vf16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rcx), %xmm8 +; AVX1-NEXT: vmovdqa (%rcx), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX1-NEXT: vmovdqa (%rdx), %xmm9 +; AVX1-NEXT: vmovdqa (%rdx), %xmm1 ; AVX1-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 ; AVX1-NEXT: vmovdqa (%rsi), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm7 ; AVX1-NEXT: vmovdqa (%rdi), %xmm4 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4],ymm3[5],ymm9[6],ymm3[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-NEXT: vpmovzxdq 
{{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] -; AVX1-NEXT: vmovaps %ymm2, (%r8) -; AVX1-NEXT: vmovaps %ymm0, 96(%r8) -; AVX1-NEXT: vmovaps %ymm1, 64(%r8) -; AVX1-NEXT: vmovaps %ymm11, 32(%r8) +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-NEXT: vmovaps %ymm0, (%r8) +; AVX1-NEXT: vmovaps %ymm5, 96(%r8) +; AVX1-NEXT: vmovaps %ymm9, 64(%r8) +; AVX1-NEXT: vmovaps %ymm3, 32(%r8) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX2-NEXT: vmovdqa (%rdx), %xmm9 +; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm3 ; AVX2-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-NEXT: vmovdqa 16(%rsi), %xmm7 ; AVX2-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} 
xmm10 = xmm1[0],zero,xmm1[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4],ymm3[5],ymm9[6],ymm3[7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7] +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] -; AVX2-NEXT: vpshufd 
{{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] -; AVX2-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-NEXT: vmovdqa %ymm0, 96(%r8) -; AVX2-NEXT: vmovdqa %ymm1, 64(%r8) -; AVX2-NEXT: vmovdqa %ymm11, 32(%r8) +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-NEXT: vmovdqa %ymm5, 96(%r8) +; AVX2-NEXT: vmovdqa %ymm9, 64(%r8) +; AVX2-NEXT: vmovdqa %ymm3, 32(%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -453,297 +453,293 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { ; SSE-LABEL: vf32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 +; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 ; SSE-NEXT: movdqa 32(%rsi), %xmm9 -; SSE-NEXT: movdqa (%rdx), %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm5 -; SSE-NEXT: movdqa 32(%rdx), %xmm6 -; SSE-NEXT: movdqa (%rcx), %xmm11 +; SSE-NEXT: movdqa (%rdx), %xmm7 +; SSE-NEXT: movdqa 16(%rdx), %xmm13 +; SSE-NEXT: movdqa 32(%rdx), %xmm10 +; SSE-NEXT: movdqa (%rcx), %xmm8 ; SSE-NEXT: movdqa 16(%rcx), %xmm14 ; SSE-NEXT: movdqa 32(%rcx), %xmm12 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = 
xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa 48(%rdx), %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; SSE-NEXT: movdqa 48(%rdx), %xmm15 
+; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] ; SSE-NEXT: movdqa 48(%rcx), %xmm12 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE-NEXT: movdqa 48(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, 224(%r8) -; SSE-NEXT: movdqa %xmm2, 240(%r8) +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; SSE-NEXT: movdqa 48(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] +; SSE-NEXT: movdqa %xmm2, 224(%r8) +; SSE-NEXT: movdqa %xmm1, 240(%r8) ; SSE-NEXT: movdqa %xmm3, 192(%r8) -; SSE-NEXT: movdqa %xmm1, 208(%r8) -; SSE-NEXT: movdqa %xmm8, 160(%r8) +; SSE-NEXT: movdqa %xmm0, 208(%r8) +; SSE-NEXT: movdqa %xmm4, 160(%r8) ; SSE-NEXT: movdqa %xmm9, 176(%r8) -; SSE-NEXT: movdqa %xmm5, 128(%r8) +; SSE-NEXT: movdqa %xmm13, 128(%r8) ; SSE-NEXT: movdqa %xmm14, 144(%r8) -; SSE-NEXT: movdqa %xmm13, 96(%r8) -; SSE-NEXT: movdqa %xmm11, 112(%r8) +; SSE-NEXT: movdqa %xmm11, 96(%r8) +; SSE-NEXT: movdqa %xmm8, 112(%r8) ; SSE-NEXT: movdqa %xmm7, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movdqa %xmm10, 32(%r8) +; SSE-NEXT: movdqa %xmm5, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movdqa %xmm15, (%r8) +; SSE-NEXT: movdqa %xmm6, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 
16(%r8) ; SSE-NEXT: retq ; ; AVX1-LABEL: vf32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rcx), %xmm12 -; AVX1-NEXT: vmovdqa 16(%rcx), %xmm15 -; AVX1-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX1-NEXT: vmovdqa 48(%rcx), %xmm11 -; AVX1-NEXT: vmovdqa (%rdx), %xmm13 +; AVX1-NEXT: vmovdqa (%rcx), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX1-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX1-NEXT: vmovdqa 48(%rcx), %xmm8 +; AVX1-NEXT: vmovdqa (%rdx), %xmm1 ; AVX1-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX1-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX1-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX1-NEXT: vmovdqa 48(%rdx), %xmm11 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm8 -; AVX1-NEXT: vmovdqa (%rsi), %xmm14 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 +; AVX1-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rsi), %xmm12 ; AVX1-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm8 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm7[0],zero,xmm7[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm14, %ymm7 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm14, %ymm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2],ymm7[3],ymm14[4],ymm7[5],ymm14[6],ymm7[7] +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,0,1,1] +; AVX1-NEXT: vpshufd 
{{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm13 +; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm10[0],zero,xmm10[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 +; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2],ymm10[3],ymm14[4],ymm10[5],ymm14[6],ymm10[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = 
ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] -; AVX1-NEXT: vmovaps %ymm2, (%r8) -; AVX1-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-NEXT: vmovaps %ymm3, 160(%r8) +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-NEXT: vmovaps %ymm0, (%r8) +; AVX1-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-NEXT: vmovaps %ymm10, 64(%r8) +; AVX1-NEXT: vmovaps %ymm9, 160(%r8) ; AVX1-NEXT: vmovaps %ymm11, 128(%r8) ; AVX1-NEXT: vmovaps %ymm8, 224(%r8) -; AVX1-NEXT: vmovaps %ymm9, 192(%r8) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-NEXT: vmovaps %ymm7, 192(%r8) +; AVX1-NEXT: vmovaps %ymm3, 32(%r8) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rcx), %xmm12 -; AVX2-NEXT: vmovdqa 16(%rcx), %xmm15 -; AVX2-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-NEXT: vmovdqa 48(%rcx), %xmm11 -; AVX2-NEXT: vmovdqa (%rdx), %xmm13 +; AVX2-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX2-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-NEXT: vmovdqa 48(%rcx), %xmm8 +; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX2-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX2-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-NEXT: vmovdqa 48(%rdx), %xmm11 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm8 -; AVX2-NEXT: vmovdqa (%rsi), %xmm14 -; AVX2-NEXT: vmovdqa 48(%rsi), %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-NEXT: vmovdqa 48(%rsi), %xmm12 ; AVX2-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm8 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm7[0],zero,xmm7[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm14, %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm14, %ymm7 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2],ymm7[3],ymm14[4],ymm7[5],ymm14[6],ymm7[7] +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm13 +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm10[0],zero,xmm10[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm14, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} 
xmm10 = xmm10[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm14, %ymm10 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2],ymm10[3],ymm14[4],ymm10[5],ymm14[6],ymm10[7] +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm12, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX2-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = 
xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] -; AVX2-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-NEXT: vmovdqa %ymm1, 96(%r8) -; AVX2-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-NEXT: vmovdqa %ymm3, 160(%r8) +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-NEXT: vmovdqa %ymm4, 96(%r8) +; AVX2-NEXT: vmovdqa %ymm10, 64(%r8) +; AVX2-NEXT: vmovdqa %ymm9, 160(%r8) ; AVX2-NEXT: vmovdqa %ymm11, 128(%r8) ; AVX2-NEXT: vmovdqa %ymm8, 224(%r8) -; AVX2-NEXT: vmovdqa %ymm9, 192(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-NEXT: vmovdqa %ymm7, 192(%r8) +; AVX2-NEXT: vmovdqa %ymm3, 32(%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -294,91 +294,91 @@ define void @vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa (%rsi), %xmm11 -; SSE-NEXT: movdqa (%rdx), %xmm10 -; SSE-NEXT: movdqa (%rcx), %xmm12 -; SSE-NEXT: movdqa (%r8), %xmm8 +; SSE-NEXT: movdqa 
(%rdi), %xmm4 +; SSE-NEXT: movdqa (%rsi), %xmm7 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa (%rcx), %xmm3 +; SSE-NEXT: movdqa (%r8), %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: pand %xmm8, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: por %xmm11, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm11[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] -; SSE-NEXT: pandn %xmm7, 
%xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, 16(%r9) -; SSE-NEXT: movdqa %xmm1, 48(%r9) -; SSE-NEXT: movdqa %xmm3, 64(%r9) +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,1] +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: psrlq $48, %xmm7 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm7[1] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,3,3] +; SSE-NEXT: pand %xmm7, %xmm12 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; SSE-NEXT: 
punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, 16(%r9) +; SSE-NEXT: movdqa %xmm4, 48(%r9) +; SSE-NEXT: movdqa %xmm1, 64(%r9) ; SSE-NEXT: movdqa %xmm0, (%r9) ; SSE-NEXT: movdqa %xmm5, 32(%r9) ; SSE-NEXT: retq @@ -392,36 +392,36 @@ ; AVX1-NEXT: vmovdqa (%r8), %xmm1 ; AVX1-NEXT: vpsrlq $48, %xmm3, %xmm2 ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,7,6] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,1,3,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6],xmm7[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3],xmm6[4,5],xmm2[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3,4,5],xmm7[6],xmm2[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,3,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3],xmm9[4,5,6],xmm7[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3],xmm10[4,5],xmm8[6,7] 
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm0[4],xmm9[5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2,3],xmm6[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1,2,3],xmm9[4,5],xmm4[6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm1[3],xmm4[4,5,6,7] -; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] @@ -430,9 +430,9 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5],xmm0[6,7] ; AVX1-NEXT: vmovdqa %xmm0, 48(%r9) ; AVX1-NEXT: vmovdqa %xmm4, 32(%r9) -; AVX1-NEXT: vmovdqa %xmm2, 16(%r9) -; AVX1-NEXT: vmovdqa %xmm10, (%r9) -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm1[2],xmm8[3,4,5,6],xmm1[7] +; AVX1-NEXT: vmovdqa %xmm8, 16(%r9) +; AVX1-NEXT: vmovdqa %xmm7, (%r9) +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] ; AVX1-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX1-NEXT: retq ; @@ -519,324 +519,323 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: vf16: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rsi), %xmm8 +; SSE-NEXT: movdqa (%rsi), %xmm15 +; SSE-NEXT: movdqa 16(%rsi), %xmm13 ; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa (%rcx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rcx), %xmm13 -; SSE-NEXT: movdqa 16(%r8), %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa (%rcx), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rcx), %xmm11 +; SSE-NEXT: movdqa 16(%r8), %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,4,4,4] -; 
SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm7, %xmm12 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: por %xmm12, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm12 
= xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm14 +; SSE-NEXT: movdqa (%rdx), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE-NEXT: psrlq $48, %xmm13 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm13[1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] +; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: pandn %xmm12, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm14[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm15, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: por 
%xmm12, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,0,1] -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: por %xmm6, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: pandn %xmm10, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: psrlq $48, %xmm11 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm4[1] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd 
{{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] -; SSE-NEXT: pandn %xmm4, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: por %xmm11, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,2,3] +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] ; SSE-NEXT: pandn %xmm0, %xmm12 ; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,1,2,2,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: pand %xmm11, %xmm13 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: por %xmm13, %xmm11 -; SSE-NEXT: movdqa %xmm11, (%r9) +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pandn 
%xmm1, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, (%r9) ; SSE-NEXT: movdqa %xmm12, 16(%r9) -; SSE-NEXT: movdqa %xmm9, 48(%r9) -; SSE-NEXT: movdqa %xmm14, 64(%r9) -; SSE-NEXT: movdqa %xmm10, 80(%r9) -; SSE-NEXT: movdqa %xmm15, 96(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%r9) +; SSE-NEXT: movdqa %xmm15, 48(%r9) +; SSE-NEXT: movdqa %xmm9, 64(%r9) +; SSE-NEXT: movdqa %xmm7, 80(%r9) +; SSE-NEXT: movdqa %xmm13, 96(%r9) +; SSE-NEXT: movdqa %xmm14, 128(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r9) -; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: vf16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rcx), %xmm13 +; AVX1-NEXT: vmovdqa (%rcx), %xmm1 ; AVX1-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX1-NEXT: vmovdqa (%rdx), %xmm9 -; AVX1-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-NEXT: vmovdqa (%rdx), %xmm5 +; AVX1-NEXT: vmovdqa 16(%rdx), %xmm9 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX1-NEXT: vandnps %ymm0, %ymm10, %ymm6 -; AVX1-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX1-NEXT: vmovdqa (%rsi), %xmm15 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-NEXT: vandps %ymm2, %ymm10, %ymm2 -; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] +; AVX1-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX1-NEXT: vmovdqa (%rsi), %xmm4 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm12 +; AVX1-NEXT: vpsrlq $48, %xmm12, %xmm3 +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3 +; AVX1-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, 
%xmm0 ; AVX1-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm4[2],xmm8[3,4,5,6],xmm4[7] -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm2[1,2,3,4],xmm4[5],xmm2[6,7] -; AVX1-NEXT: vpsrlq $48, %xmm15, %xmm2 -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm14[1],xmm2[1] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,3,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5,7,6] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-NEXT: vandps %ymm5, %ymm10, %ymm5 -; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,6,6] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,2] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4,5,6],xmm4[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3],xmm1[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3,4,5,6],xmm8[7] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2,3,4],xmm8[5],xmm3[6,7] +; AVX1-NEXT: vpsrlq $48, %xmm4, %xmm8 +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm2[1],xmm8[1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,3,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,1,1] +; AVX1-NEXT: vinsertf128 $1, 
%xmm14, %ymm8, %ymm14 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5,7,6] +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,3,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-NEXT: vmovdqa (%r8), %xmm4 -; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,1,2,2] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7] -; AVX1-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm14[4],xmm6[5,6,7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-NEXT: vandnps %ymm5, %ymm6, %ymm5 -; AVX1-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4,5,6],xmm6[7] -; AVX1-NEXT: vorps %ymm5, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4],xmm6[5],xmm5[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm3, 32(%r9) -; AVX1-NEXT: vmovdqa %xmm5, 48(%r9) +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 +; AVX1-NEXT: vandnps %ymm14, %ymm6, %ymm14 +; AVX1-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm0, %ymm14, %ymm6 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5,6,6] +; AVX1-NEXT: vpshufd {{.*#+}} 
xmm12 = xmm12[0,2,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm12 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1],xmm13[2],xmm10[3,4,5,6],xmm13[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-NEXT: vandnps %ymm12, %ymm15, %ymm7 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX1-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,1,3,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,6] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX1-NEXT: vmovdqa (%r8), %xmm12 +; AVX1-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-NEXT: vandps %ymm15, %ymm11, %ymm11 +; AVX1-NEXT: vorps %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3,4,5],xmm13[6],xmm11[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; 
AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm4[2],xmm6[3,4,5,6],xmm4[7] +; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3,4],xmm4[5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[3],xmm1[4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm1, 32(%r9) +; AVX1-NEXT: vmovdqa %xmm2, 48(%r9) ; AVX1-NEXT: vmovdqa %xmm0, (%r9) -; AVX1-NEXT: vmovdqa %xmm1, 16(%r9) -; AVX1-NEXT: vmovdqa %xmm7, 96(%r9) -; AVX1-NEXT: vmovdqa %xmm11, 112(%r9) -; AVX1-NEXT: vmovdqa %xmm2, 64(%r9) +; AVX1-NEXT: vmovdqa %xmm11, 16(%r9) +; AVX1-NEXT: vmovdqa %xmm9, 96(%r9) +; AVX1-NEXT: vmovdqa %xmm7, 112(%r9) +; AVX1-NEXT: vmovdqa %xmm5, 64(%r9) ; AVX1-NEXT: vmovdqa %xmm10, 80(%r9) -; AVX1-NEXT: vmovdqa %xmm8, 128(%r9) +; AVX1-NEXT: vmovdqa %xmm3, 128(%r9) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 144(%r9) ; AVX1-NEXT: vzeroupper @@ -844,83 +843,83 @@ ; ; AVX2-SLOW-LABEL: vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,2,2,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2],xmm1[3],xmm6[4,5],xmm1[6],xmm6[7] -; AVX2-SLOW-NEXT: 
vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2],ymm1[3,4],ymm7[5,6,7,8],ymm1[9],ymm7[10],ymm1[11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5],ymm1[6],ymm7[7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13],ymm1[14],ymm7[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3,4],ymm9[5,6,7,8],ymm8[9],ymm9[10],ymm8[11,12],ymm9[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3],ymm7[4],ymm10[5,6],ymm7[7],ymm10[8,9],ymm7[10],ymm10[11],ymm7[12],ymm10[13,14],ymm7[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 
= ymm8[1,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 128(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-SLOW-NEXT: vzeroupper @@ -928,79 +927,79 @@ ; ; AVX2-FAST-LABEL: vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,2,2,2] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,1,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb 
%ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3,4],ymm2[5,6,7,8],ymm1[9],ymm2[10],ymm1[11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3],ymm2[4],ymm7[5,6],ymm2[7],ymm7[8,9],ymm2[10],ymm7[11],ymm2[12],ymm7[13,14],ymm2[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm1[0,1,1,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = 
[255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3,4],ymm9[5,6,7,8],ymm8[9],ymm9[10],ymm8[11,12],ymm9[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: 
vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm9[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm8[1,1,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 128(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm6, (%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%r9) ; AVX2-FAST-NEXT: vzeroupper @@ -1051,408 +1050,402 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $248, %rsp -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm13 -; SSE-NEXT: movdqa 16(%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm14 -; SSE-NEXT: movdqa (%rcx), %xmm11 +; SSE-NEXT: subq $232, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm8 +; SSE-NEXT: movdqa 16(%rsi), %xmm13 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa (%rcx), %xmm0 ; SSE-NEXT: movdqa 16(%rcx), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa (%r8), %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm9, %xmm7 ; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm10, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,2,2] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa 16(%r8), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
32(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 32(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa 32(%rcx), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa 32(%rdx), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,2,2] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: movdqa 16(%rdx), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,2,2] +; SSE-NEXT: pand %xmm14, %xmm11 +; SSE-NEXT: por %xmm11, %xmm7 ; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: movdqa 32(%r8), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: movdqa 16(%r8), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm10, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm10, %xmm7 +; SSE-NEXT: movdqa 32(%rcx), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: movdqa 32(%rdx), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,2,2] +; SSE-NEXT: pand %xmm14, %xmm12 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: movdqa 32(%r8), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw 
{{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: movdqa 48(%rcx), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: pandn %xmm7, %xmm12 +; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa 48(%r8), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm15[0,1,0,1] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm12 +; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,2] +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: 
pand %xmm15, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 48(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa 48(%rcx), %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: por %xmm12, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa 48(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: movdqa 48(%r8), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm8 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd 
{{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,1,0,1] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn 
%xmm11, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm13 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: por %xmm2, %xmm13 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,1,0,1] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,1,0,1] +; SSE-NEXT: pandn %xmm6, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; 
SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,0,1] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm6, %xmm12 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm3[1] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pand %xmm14, %xmm10 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm12, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,1] +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: psrlq $48, %xmm7 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm0, 
%xmm8 -; SSE-NEXT: movdqa %xmm8, 304(%r9) -; SSE-NEXT: movdqa %xmm13, 288(%r9) -; SSE-NEXT: movdqa %xmm3, 256(%r9) -; SSE-NEXT: movdqa %xmm15, 240(%r9) +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm14, 304(%r9) +; SSE-NEXT: movdqa %xmm15, 288(%r9) +; SSE-NEXT: movdqa %xmm2, 256(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%r9) ; SSE-NEXT: movdqa %xmm9, 224(%r9) -; SSE-NEXT: movdqa %xmm12, 208(%r9) +; SSE-NEXT: movdqa %xmm7, 208(%r9) ; SSE-NEXT: movdqa %xmm11, 176(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%r9) +; SSE-NEXT: movdqa %xmm13, 160(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) @@ -1470,118 +1463,118 @@ ; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) -; SSE-NEXT: addq $248, %rsp +; SSE-NEXT: addq $232, %rsp ; SSE-NEXT: retq ; ; AVX1-LABEL: vf32: ; AVX1: # %bb.0: ; AVX1-NEXT: subq $72, %rsp -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX1-NEXT: vmovdqa 48(%rsi), %xmm6 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = 
xmm13[0,1,2,3,4,5,6,6] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-NEXT: vandnps %ymm0, %ymm12, %ymm1 -; AVX1-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX1-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-NEXT: vandnps %ymm0, %ymm14, %ymm1 +; AVX1-NEXT: vmovdqa 32(%rdx), %xmm11 ; AVX1-NEXT: vmovdqa 48(%rdx), %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] -; AVX1-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX1-NEXT: vmovdqa 48(%rcx), %xmm7 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6],xmm4[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-NEXT: vandps %ymm2, %ymm12, %ymm2 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm10 -; AVX1-NEXT: vmovdqa 48(%r8), %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; AVX1-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,6] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 -; AVX1-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX1-NEXT: vandnps %ymm4, %ymm14, %ymm4 -; AVX1-NEXT: vpsrlq $48, %xmm5, %xmm2 -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX1-NEXT: vmovdqa 48(%rcx), %xmm4 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3,4,5,6],xmm7[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 ; AVX1-NEXT: vandps %ymm2, %ymm14, %ymm2 -; AVX1-NEXT: vorps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7] -; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7] -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,2,2] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw 
{{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm11[4],xmm4[5,6,7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] +; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovdqa 48(%r8), %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] +; AVX1-NEXT: vandnps %ymm7, %ymm12, %ymm7 +; AVX1-NEXT: vpsrlq $48, %xmm6, %xmm8 +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm5[1],xmm8[1] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-NEXT: vorps %ymm2, %ymm4, %ymm5 -; AVX1-NEXT: vpsrlq $48, %xmm15, %xmm2 -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm11[1],xmm2[1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,3,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-NEXT: vandps %ymm5, %ymm12, %ymm5 +; AVX1-NEXT: vorps %ymm7, %ymm5, %ymm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1,2,3,4],xmm6[5],xmm5[6,7] +; AVX1-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7] +; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,2] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5,6,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = 
xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-NEXT: vandnps %ymm5, %ymm8, %ymm5 +; AVX1-NEXT: vandps %ymm7, %ymm8, %ymm7 +; AVX1-NEXT: vorps %ymm5, %ymm7, %ymm5 +; AVX1-NEXT: vpsrlq $48, %xmm10, %xmm7 +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm9[1],xmm7[1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX1-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3],xmm3[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm1[1],xmm10[2,3,4,5],xmm1[6],xmm10[7] -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2 -; AVX1-NEXT: vandps %ymm0, %ymm14, %ymm0 -; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm2[3],xmm6[4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vandnps %ymm3, %ymm12, %ymm1 +; AVX1-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7] -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm3[2],xmm0[3,4,5,6],xmm3[7] -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4],xmm3[5],xmm1[6,7] +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm3[2],xmm0[3,4,5,6],xmm3[7] +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa 16(%rdx), %xmm9 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; AVX1-NEXT: vmovdqa 16(%rdx), %xmm8 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,3,2,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] @@ -1590,8 +1583,8 @@ ; AVX1-NEXT: vmovdqa 16(%rcx), %xmm6 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7] ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] @@ -1599,7 +1592,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] @@ -1612,112 +1605,110 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2 -; AVX1-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-NEXT: vandps %ymm4, %ymm12, %ymm4 ; AVX1-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-NEXT: vmovdqa 16(%r8), %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] +; AVX1-NEXT: vmovdqa 16(%r8), %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm2[1,2,3,4],xmm4[5],xmm2[6,7] ; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6],xmm4[7] ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-NEXT: vmovdqa (%rsi), %xmm13 -; AVX1-NEXT: vpsrlq $48, %xmm13, %xmm2 -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm10[1],xmm2[1] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,3,2,4,5,6,7] +; AVX1-NEXT: vmovdqa (%rdi), %xmm15 +; AVX1-NEXT: vmovdqa (%rsi), %xmm9 +; AVX1-NEXT: vpsrlq $48, %xmm9, %xmm2 +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm15[1],xmm2[1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,3,2,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vmovdqa (%rdx), %xmm3 -; AVX1-NEXT: vmovdqa (%rcx), %xmm4 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,7,6] -; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,2,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 -; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2 -; AVX1-NEXT: vandps %ymm5, %ymm14, %ymm5 -; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm5 +; AVX1-NEXT: vmovdqa (%rdx), %xmm4 +; AVX1-NEXT: vmovdqa (%rcx), %xmm5 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,5,7,6] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,3,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,0,2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 +; AVX1-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-NEXT: vandps %ymm12, %ymm11, %ymm11 +; AVX1-NEXT: vorps %ymm2, %ymm11, %ymm12 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5,6,6] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,2,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,2,2] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm14 +; AVX1-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[3],xmm2[4,5,6,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,3,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm6, %ymm14, %ymm6 -; AVX1-NEXT: vandps %ymm0, %ymm14, %ymm0 -; AVX1-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3],xmm2[4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,3,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-NEXT: vandps %ymm6, %ymm14, %ymm6 +; AVX1-NEXT: vorps %ymm2, %ymm6, %ymm2 ; AVX1-NEXT: vmovdqa (%r8), %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm7[4],xmm0[5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3,4,5],xmm7[6],xmm0[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7] -; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm10[4],xmm4[5,6,7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1,2,3],xmm7[4],xmm2[5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3,4,5],xmm7[6],xmm2[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-NEXT: 
vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm15[4],xmm5[5,6,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 ; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-NEXT: vandnps %ymm3, %ymm7, %ymm3 -; AVX1-NEXT: vandps %ymm7, %ymm4, %ymm4 -; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3],xmm3[4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4],xmm4[5],xmm3[6,7] -; AVX1-NEXT: vmovdqa %xmm3, 48(%r9) +; AVX1-NEXT: vandnps %ymm4, %ymm7, %ymm4 +; AVX1-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm5[2],xmm12[3,4,5,6],xmm5[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] +; AVX1-NEXT: vmovdqa %xmm4, 48(%r9) ; AVX1-NEXT: vmovdqa %xmm6, 32(%r9) -; AVX1-NEXT: vmovdqa %xmm0, 16(%r9) -; AVX1-NEXT: vmovdqa %xmm1, (%r9) -; AVX1-NEXT: vmovdqa %xmm2, 112(%r9) -; AVX1-NEXT: vmovdqa %xmm11, 96(%r9) -; AVX1-NEXT: vmovdqa %xmm9, 80(%r9) -; AVX1-NEXT: vmovdqa %xmm5, 64(%r9) +; AVX1-NEXT: vmovdqa %xmm2, 16(%r9) +; AVX1-NEXT: vmovdqa %xmm8, (%r9) +; AVX1-NEXT: vmovdqa %xmm3, 112(%r9) +; AVX1-NEXT: vmovdqa %xmm1, 96(%r9) +; AVX1-NEXT: vmovdqa %xmm0, 80(%r9) +; AVX1-NEXT: vmovdqa %xmm7, 64(%r9) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 144(%r9) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1748,333 +1739,335 @@ ; ; AVX2-SLOW-LABEL: vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $40, %rsp -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-SLOW-NEXT: subq $72, %rsp +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm13 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2],xmm1[3],xmm6[4,5],xmm1[6],xmm6[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = 
xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3],xmm10[4,5],xmm6[6],xmm10[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5],xmm5[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; 
AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5],xmm11[6],xmm9[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3,4],ymm7[5,6,7,8],ymm5[9],ymm7[10],ymm5[11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4,5],ymm5[6],ymm9[7,8],ymm5[9],ymm9[10],ymm5[11],ymm9[12,13],ymm5[14],ymm9[15] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2],ymm11[3,4],ymm9[5,6,7,8],ymm11[9],ymm9[10],ymm11[11,12],ymm9[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,2] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2],ymm9[3,4],ymm14[5,6,7,8],ymm9[9],ymm14[10],ymm9[11,12],ymm14[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm15, 
%ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3],ymm13[4],ymm11[5,6],ymm13[7],ymm11[8,9],ymm13[10],ymm11[11],ymm13[12],ymm11[13,14],ymm13[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5],ymm11[6],ymm0[7,8],ymm11[9],ymm0[10,11],ymm11[12],ymm0[13],ymm11[14],ymm0[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3],ymm8[4],ymm11[5,6],ymm8[7],ymm11[8,9],ymm8[10],ymm11[11],ymm8[12],ymm11[13,14],ymm8[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm9, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm15, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,2,3,3,7,6,7,7] +; 
AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2],ymm4[3,4],ymm15[5,6,7,8],ymm4[9],ymm15[10],ymm4[11,12],ymm15[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm4, %ymm9 +; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5],ymm15[6],ymm4[7,8],ymm15[9],ymm4[10,11],ymm15[12],ymm4[13],ymm15[14],ymm4[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4,5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10],ymm6[11],ymm11[12,13],ymm6[14],ymm11[15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13],ymm6[14],ymm9[15] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm6, %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 224(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 128(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 128(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 288(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 256(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 288(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 256(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-SLOW-NEXT: addq $40, %rsp +; AVX2-SLOW-NEXT: addq $72, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: vf32: 
; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $40, %rsp -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm4 ; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,1,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,1,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5],xmm10[6],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0],xmm10[1],xmm6[2],xmm10[3],xmm6[4,5],xmm10[6],xmm6[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = 
<255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3,4],ymm4[5,6,7,8],ymm6[9],ymm4[10],ymm6[11,12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm8, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm11 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10],ymm13[11],ymm11[12,13],ymm13[14],ymm11[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm14 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[3,2,3,3,7,6,7,7] +; 
AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2],ymm15[3,4],ymm14[5,6,7,8],ymm15[9],ymm14[10],ymm15[11,12],ymm14[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7,8],ymm4[9],ymm12[10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm12 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2],ymm15[3,4],ymm5[5,6,7,8],ymm15[9],ymm5[10],ymm15[11,12],ymm5[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3,4],ymm12[5,6,7,8],ymm5[9],ymm12[10],ymm5[11,12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm14 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10,11],ymm15[12],ymm6[13],ymm15[14],ymm6[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13],ymm15[14],ymm14[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3],ymm8[4],ymm0[5,6],ymm8[7],ymm0[8,9],ymm8[10],ymm0[11],ymm8[12],ymm0[13,14],ymm8[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3],ymm15[4],ymm6[5,6],ymm15[7],ymm6[8,9],ymm15[10],ymm6[11],ymm15[12],ymm6[13,14],ymm15[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3],ymm7[4],ymm1[5,6],ymm7[7],ymm1[8,9],ymm7[10],ymm1[11],ymm7[12],ymm1[13,14],ymm7[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm14[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm7, %ymm15 +; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm12[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm6, 
%ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm9, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm14[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8],ymm1[9],ymm7[10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm8, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 224(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 128(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 288(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 224(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 128(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 288(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm15, 256(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm8, (%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -188,18 +188,18 @@ ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm2[0],xmm1[0] ; AVX1-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm5[0],xmm4[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,1,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,1,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,2,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,0] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 @@ -208,7 +208,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] ; AVX1-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX1-NEXT: vmovaps %ymm1, (%rax) @@ -353,132 +353,132 @@ ; SSE-LABEL: vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa (%rsi), %xmm8 ; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm3 +; SSE-NEXT: movdqa (%rcx), %xmm9 +; SSE-NEXT: movdqa (%r8), %xmm7 ; SSE-NEXT: movdqa (%r9), %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm9[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm11, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; SSE-NEXT: movaps %xmm11, %xmm8 -; SSE-NEXT: andnps %xmm6, %xmm8 -; SSE-NEXT: orps %xmm4, %xmm8 -; SSE-NEXT: 
punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm11, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: andnps %xmm1, %xmm11 -; SSE-NEXT: orps %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm12[0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: andps %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: andnps %xmm6, %xmm1 -; SSE-NEXT: orps %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm1[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm0[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] +; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,1,2,1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: andnps %xmm11, %xmm4 +; SSE-NEXT: orps %xmm10, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm2[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] +; SSE-NEXT: andps %xmm0, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; SSE-NEXT: andnps %xmm9, %xmm0 +; SSE-NEXT: orps %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm3[0] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm8[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: andps 
%xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: andnps %xmm11, %xmm9 +; SSE-NEXT: orps %xmm10, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm10[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm10, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: andnps %xmm13, %xmm0 -; SSE-NEXT: orps %xmm7, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm10[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,1],xmm10[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] -; SSE-NEXT: andps %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,1,1] +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: andnps %xmm13, %xmm11 +; SSE-NEXT: orps %xmm12, %xmm11 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm6[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[0,2] +; SSE-NEXT: andps %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; SSE-NEXT: pslld $16, %xmm5 -; SSE-NEXT: andnps %xmm5, %xmm4 -; SSE-NEXT: orps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[0,2] -; SSE-NEXT: andps %xmm6, %xmm7 -; SSE-NEXT: andnps %xmm3, %xmm6 -; SSE-NEXT: orps %xmm7, %xmm6 -; SSE-NEXT: movaps %xmm6, 16(%rax) -; SSE-NEXT: movaps %xmm4, 48(%rax) -; SSE-NEXT: movaps %xmm0, 64(%rax) -; SSE-NEXT: movaps %xmm1, (%rax) -; SSE-NEXT: movaps %xmm11, 80(%rax) -; SSE-NEXT: movaps %xmm8, 32(%rax) +; SSE-NEXT: andnps %xmm5, %xmm8 +; SSE-NEXT: orps %xmm2, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[0,2] +; SSE-NEXT: andps %xmm10, %xmm12 +; SSE-NEXT: andnps %xmm6, %xmm10 +; SSE-NEXT: orps %xmm12, %xmm10 +; SSE-NEXT: movaps %xmm10, 16(%rax) +; SSE-NEXT: movaps %xmm8, 48(%rax) +; SSE-NEXT: movaps %xmm11, 64(%rax) +; SSE-NEXT: movaps %xmm9, (%rax) +; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps %xmm4, 32(%rax) ; SSE-NEXT: retq ; ; AVX1-LABEL: vf8: ; AVX1: # %bb.0: ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-NEXT: vmovdqa (%rsi), %xmm9 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 ; AVX1-NEXT: vmovdqa (%rdx), %xmm2 ; AVX1-NEXT: vmovdqa (%rcx), %xmm3 -; AVX1-NEXT: vmovdqa (%r8), %xmm11 +; AVX1-NEXT: vmovdqa (%r8), %xmm4 ; AVX1-NEXT: vmovdqa (%r9), %xmm5 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3],xmm6[4,5],xmm0[6,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm6[4,5],xmm8[6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5],xmm0[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm6[1],xmm5[1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3,4,5],xmm1[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,2,3,3] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm0[1],xmm4[1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3,4,5],xmm5[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5],xmm2[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; 
AVX1-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-NEXT: vmovaps %ymm10, (%rax) +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-NEXT: vmovaps %ymm8, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -648,173 +648,173 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm11 ; SSE-NEXT: movdqa (%rsi), %xmm15 -; SSE-NEXT: movdqa 16(%rsi), %xmm4 -; SSE-NEXT: movdqa (%rdx), %xmm8 -; SSE-NEXT: movdqa 16(%rdx), %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rcx), %xmm6 -; SSE-NEXT: movdqa 16(%r8), %xmm3 -; SSE-NEXT: movdqa 16(%r9), %xmm13 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm7[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; SSE-NEXT: movaps %xmm12, %xmm2 -; SSE-NEXT: andnps %xmm5, %xmm2 -; SSE-NEXT: orps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm5[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; SSE-NEXT: movaps %xmm12, %xmm2 -; SSE-NEXT: andnps %xmm5, %xmm2 -; SSE-NEXT: orps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm4[3,3] -; SSE-NEXT: movdqa (%r8), %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm5[2,3] -; SSE-NEXT: 
movdqa (%r9), %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: andnps %xmm5, %xmm0 +; SSE-NEXT: movdqa 16(%rsi), %xmm6 +; SSE-NEXT: movdqa (%rdx), %xmm5 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: movdqa (%rcx), %xmm14 +; SSE-NEXT: movdqa 16(%rcx), %xmm8 +; SSE-NEXT: movdqa 16(%r8), %xmm9 +; SSE-NEXT: movdqa 16(%r9), %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm7[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm2, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: andnps %xmm13, %xmm4 +; SSE-NEXT: orps %xmm12, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm0[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm8[0,1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] -; SSE-NEXT: andps %xmm12, %xmm6 -; SSE-NEXT: orps %xmm6, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm8[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm6[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] -; SSE-NEXT: andps %xmm12, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; SSE-NEXT: andnps %xmm6, %xmm12 -; SSE-NEXT: orps %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[0,2] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: andps %xmm2, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; 
SSE-NEXT: andnps %xmm8, %xmm4 +; SSE-NEXT: orps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm12[3,3] +; SSE-NEXT: movdqa (%r8), %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm6[2,3] +; SSE-NEXT: movdqa (%r9), %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: andnps %xmm4, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] +; SSE-NEXT: andps %xmm2, %xmm15 +; SSE-NEXT: orps %xmm15, %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm5[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm14[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] +; SSE-NEXT: andps %xmm2, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; SSE-NEXT: andnps %xmm14, %xmm2 +; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[0,2] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: andps %xmm15, %xmm11 +; SSE-NEXT: por %xmm11, %xmm14 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[0,2] +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: pslld $16, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: andps %xmm3, %xmm7 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm7[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: andps %xmm15, %xmm11 +; SSE-NEXT: por %xmm11, %xmm7 +; SSE-NEXT: 
movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[0,2] +; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm9 +; SSE-NEXT: andps %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: andps %xmm6, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm9[0] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm9[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[0,2] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pslld $16, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm10 -; SSE-NEXT: andps %xmm9, %xmm7 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: andps %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm14[0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[0,2] -; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm13, %xmm3 -; SSE-NEXT: andps %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm13[0] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm13[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[0,2] ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: andps %xmm6, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: andps %xmm9, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps 
%xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] -; SSE-NEXT: andps %xmm6, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm11[0,2] -; SSE-NEXT: andps %xmm9, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5] -; SSE-NEXT: pandn %xmm15, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: andps %xmm3, %xmm12 +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm1[0,2] +; SSE-NEXT: andps %xmm15, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: por %xmm11, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm12[0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm12[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[0,2] +; SSE-NEXT: andps %xmm3, %xmm5 +; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5] +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm9, (%rax) -; SSE-NEXT: movdqa %xmm6, 16(%rax) -; SSE-NEXT: movdqa %xmm5, 48(%rax) -; SSE-NEXT: movdqa %xmm1, 64(%rax) -; SSE-NEXT: movdqa %xmm3, 96(%rax) +; SSE-NEXT: movdqa %xmm3, (%rax) +; SSE-NEXT: movdqa %xmm15, 16(%rax) +; SSE-NEXT: movdqa %xmm10, 48(%rax) +; SSE-NEXT: movdqa %xmm0, 64(%rax) +; SSE-NEXT: movdqa %xmm9, 96(%rax) ; SSE-NEXT: movdqa %xmm7, 112(%rax) -; SSE-NEXT: movdqa %xmm10, 144(%rax) +; SSE-NEXT: movdqa %xmm4, 144(%rax) +; SSE-NEXT: movdqa %xmm14, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps %xmm12, 32(%rax) +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -825,139 +825,135 @@ ; ; AVX1-LABEL: vf16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rcx), %xmm8 +; AVX1-NEXT: vmovdqa (%rcx), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX1-NEXT: vmovdqa (%rdx), %xmm9 +; AVX1-NEXT: vmovdqa (%rdx), %xmm3 ; AVX1-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovdqa (%rsi), %xmm15 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa (%rdi), %xmm11 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT: vmovdqa (%rsi), %xmm5 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX1-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7] -; AVX1-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX1-NEXT: vpslld $16, %xmm7, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm6[5],xmm0[6,7] -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX1-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4,5],xmm7[6,7] +; AVX1-NEXT: vmovdqa 16(%r9), %xmm12 +; AVX1-NEXT: vpslld $16, %xmm12, %xmm11 +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm11[5],xmm7[6,7] +; AVX1-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[2,1,3,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5,6],xmm7[7] +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, 
%ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5],xmm4[6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,4,6,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6],xmm4[7] ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX1-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 +; AVX1-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] +; AVX1-NEXT: vmovdqa (%r8), %xmm15 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,5,7,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3,4,5],xmm0[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm11 +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm11[2,3,4,5],xmm0[6,7] ; AVX1-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5,6],xmm5[7] -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm6 = 
xmm14[0],xmm6[1],xmm14[2,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm6[0,1,2],xmm5[3],xmm6[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[1,1,2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1,2],xmm6[3],xmm3[4,5,6,7] -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1],xmm4[0],xmm5[3] -; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm8 -; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5,6],ymm6[7] -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm3 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm6[0,1],xmm2[0],xmm6[3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm4[1],xmm11[2,3,4,5,6],xmm4[7] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1,2],xmm14[3],xmm4[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5,6],ymm4[7] +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm10[0],xmm4[3] +; AVX1-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3,4],xmm9[5],xmm4[6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm5 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0],xmm3[3] ; AVX1-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5],xmm4[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5],xmm3[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm2[4,5],xmm5[6,7] -; AVX1-NEXT: vpslld $16, %xmm0, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm15[4,5],xmm2[6,7] +; AVX1-NEXT: vpslld $16, %xmm0, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,1,3,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0],xmm1[1,2],xmm4[3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX1-NEXT: vmovdqa %xmm5, 48(%rax) -; AVX1-NEXT: vmovdqa %xmm4, (%rax) -; AVX1-NEXT: vmovdqa %xmm3, 16(%rax) -; AVX1-NEXT: vmovdqa %xmm10, 96(%rax) -; AVX1-NEXT: vmovdqa %xmm12, 112(%rax) +; AVX1-NEXT: vmovdqa %xmm2, 48(%rax) +; AVX1-NEXT: vmovdqa %xmm3, (%rax) +; AVX1-NEXT: vmovdqa %xmm5, 16(%rax) +; AVX1-NEXT: vmovdqa %xmm9, 96(%rax) +; AVX1-NEXT: vmovdqa %xmm8, 112(%rax) ; AVX1-NEXT: vmovdqa %xmm14, 64(%rax) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm0, 80(%rax) -; AVX1-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm0, 160(%rax) +; AVX1-NEXT: vmovdqa %xmm11, 80(%rax) +; AVX1-NEXT: vmovdqa %xmm7, 160(%rax) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 176(%rax) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -969,113 +965,113 @@ ; ; AVX2-SLOW-LABEL: vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm1 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm9[1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] +; AVX2-SLOW-NEXT: 
vmovdqa (%r9), %xmm11 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm14 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2],ymm14[3,4],ymm4[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm4[1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm14, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = 
ymm4[1,1,1,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm14 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm11[4],ymm8[5],ymm11[5],ymm8[6],ymm11[6],ymm8[7],ymm11[7],ymm8[12],ymm11[12],ymm8[13],ymm11[13],ymm8[14],ymm11[14],ymm8[15],ymm11[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm15[4],ymm9[4],ymm15[5],ymm9[5],ymm15[6],ymm9[6],ymm15[7],ymm9[7],ymm15[12],ymm9[12],ymm15[13],ymm9[13],ymm15[14],ymm9[14],ymm15[15],ymm9[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,0,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, 
%ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm14, %ymm15, %ymm14 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm13, %ymm15, %ymm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,0,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[8],ymm11[8],ymm8[9],ymm11[9],ymm8[10],ymm11[10],ymm8[11],ymm11[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm15[0],ymm9[0],ymm15[1],ymm9[1],ymm15[2],ymm9[2],ymm15[3],ymm9[3],ymm15[8],ymm9[8],ymm15[9],ymm9[9],ymm15[10],ymm9[10],ymm15[11],ymm9[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm14, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) @@ -1086,108 +1082,108 @@ ; ; AVX2-FAST-ALL-LABEL: vf16: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm11 +; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm2 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm5, %xmm1 
+; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm7, %xmm5, %xmm8 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm7 -; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm0 -; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,1,3,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %xmm2 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm9 +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm11 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm10, %ymm10 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] +; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %xmm11 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm10, %ymm12 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm3, %ymm9, %ymm3 -; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm13, %ymm7, %ymm12, %ymm7 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm12 = +; 
AVX2-FAST-ALL-NEXT: vpshufb %ymm12, %ymm1, %ymm14 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] ; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm14 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm15 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] ; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %ymm12 -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm13, %ymm3, %ymm14, %ymm3 -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = <1,u,u,2,u,u,3,u> -; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm14, %ymm3 -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7] -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3],ymm14[4],ymm3[5,6],ymm14[7] -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm3, %ymm14, %ymm14 -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm4[4],ymm15[5],ymm4[5],ymm15[6],ymm4[6],ymm15[7],ymm4[7],ymm15[12],ymm4[12],ymm15[13],ymm4[13],ymm15[14],ymm4[14],ymm15[15],ymm4[15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = 
ymm15[2,2,2,2] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm7 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 = <1,u,u,2,u,u,3,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm14, %ymm15, %ymm14 +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm14, %ymm15, %ymm14 +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = <5,u,u,6,u,u,7,u> -; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm13, %ymm3 -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6],ymm13[7] -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm13 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm3, %ymm13, %ymm3 -; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-ALL-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-FAST-ALL-NEXT: vpermd %ymm15, %ymm13, %ymm13 +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm15 = 
ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm13, %ymm15, %ymm7 +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-ALL-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm8, %ymm9, %ymm8 ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7] -; AVX2-FAST-ALL-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm4[0],ymm15[1],ymm4[1],ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[8],ymm4[8],ymm15[9],ymm4[9],ymm15[10],ymm4[10],ymm15[11],ymm4[11] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5,6],ymm8[7] +; AVX2-FAST-ALL-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7] -; AVX2-FAST-ALL-NEXT: 
vpshuflw {{.*#+}} ymm4 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm7, 160(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, (%rax) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm14, 64(%rax) ; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 128(%rax) @@ -1198,119 +1194,112 @@ ; ; AVX2-FAST-PERLANE-LABEL: vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm14[0],ymm3[1],ymm14[1],ymm3[2],ymm14[2],ymm3[3],ymm14[3],ymm3[8],ymm14[8],ymm3[9],ymm14[9],ymm3[10],ymm14[10],ymm3[11],ymm14[11] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm14 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm15 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm10 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm11 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm12[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm13, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm14 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm15 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm14 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2],ymm8[3,4],ymm12[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm15[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; 
AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4],ymm14[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3],ymm14[4],ymm8[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm14, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm6, %ymm6 +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -1365,367 +1354,361 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $296, %rsp # imm = 0x128 -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: subq $280, %rsp # imm = 0x118 +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm13 ; SSE-NEXT: movdqa (%rsi), %xmm2 -; SSE-NEXT: movdqa 16(%rsi), %xmm14 -; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm9 -; SSE-NEXT: movdqa (%rcx), %xmm7 -; SSE-NEXT: movdqa 16(%rcx), %xmm15 -; SSE-NEXT: movdqa (%r8), %xmm5 -; SSE-NEXT: movdqa (%r9), %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm4[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = 
xmm5[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm4[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm12[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: andnps %xmm6, %xmm4 -; SSE-NEXT: orps %xmm3, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa 16(%rsi), %xmm9 +; SSE-NEXT: movdqa (%rdx), %xmm12 +; SSE-NEXT: movdqa 16(%rdx), %xmm14 +; SSE-NEXT: movdqa (%rcx), %xmm4 +; SSE-NEXT: movdqa 16(%rcx), %xmm10 +; SSE-NEXT: movdqa (%r8), %xmm7 +; SSE-NEXT: movdqa (%r9), %xmm11 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm3[2,3] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm3[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] +; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: andnps %xmm11, %xmm0 +; SSE-NEXT: orps %xmm8, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm12[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm4[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: andnps %xmm3, %xmm4 -; SSE-NEXT: orps %xmm2, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: andps %xmm6, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: movaps %xmm6, %xmm0 +; 
SSE-NEXT: andnps %xmm4, %xmm0 +; SSE-NEXT: orps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 16(%r8), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm6[0,1] -; SSE-NEXT: movdqa 16(%r9), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3] +; SSE-NEXT: movdqa 16(%r8), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: andnps %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: orps %xmm3, %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm9[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm6[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: andnps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: orps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm8[0,1] +; SSE-NEXT: movdqa 16(%r9), %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: andnps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm11 +; SSE-NEXT: orps %xmm11, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm2 -; SSE-NEXT: movdqa 32(%rcx), %xmm15 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; SSE-NEXT: movdqa %xmm14, (%rsp) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm14[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,6,5,7,7] +; 
SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm10[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: andnps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm9 +; SSE-NEXT: orps %xmm9, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm1 +; SSE-NEXT: movdqa 32(%rcx), %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa 32(%rsi), %xmm14 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 32(%r8), %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm0[0,1] -; SSE-NEXT: movdqa 32(%r9), %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; SSE-NEXT: movdqa 32(%rdi), %xmm12 +; SSE-NEXT: movdqa 32(%rsi), %xmm10 +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm0[3,3] +; SSE-NEXT: movdqa 32(%r8), %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm13[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[0,1] +; SSE-NEXT: movdqa 32(%r9), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: andnps %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: andnps %xmm0, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm15 +; SSE-NEXT: orps %xmm15, %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; 
SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm9[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: andnps %xmm9, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdx), %xmm0 -; SSE-NEXT: movdqa 48(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rcx), %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm14 +; SSE-NEXT: movdqa 48(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm1[3,3] -; SSE-NEXT: movdqa 48(%r8), %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm13[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm3[3,3] +; SSE-NEXT: movdqa 48(%r8), %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm10[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[0,1] ; SSE-NEXT: movdqa 48(%r9), %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: andnps %xmm7, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: andnps %xmm14, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] -; SSE-NEXT: andps %xmm10, %xmm15 -; SSE-NEXT: orps %xmm15, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE-NEXT: andps %xmm6, %xmm15 +; SSE-NEXT: orps %xmm15, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm9[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; SSE-NEXT: andnps %xmm7, %xmm10 -; SSE-NEXT: orps %xmm2, %xmm10 +; SSE-NEXT: andps %xmm6, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; SSE-NEXT: andnps %xmm9, %xmm6 +; SSE-NEXT: orps %xmm2, %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: andps %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm8[0,2] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[0,0,1,1] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm7[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm5[0,2] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: andps %xmm5, %xmm14 +; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm12 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: andps %xmm14, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[0,2] -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,1],xmm0[1,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm14[0,2] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: andps %xmm9, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm11[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm15[1] +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,2] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: andps %xmm5, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[0,2] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; 
SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[0,2] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm1[0,2] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: andps %xmm9, %xmm6 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: andps %xmm5, %xmm6 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: andps %xmm9, %xmm7 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm14, %xmm7 +; SSE-NEXT: andps %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm11[0] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm11[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm5 -; 
SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[0,2] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: andps %xmm9, %xmm8 +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm13[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm11[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm8 +; SSE-NEXT: andps %xmm5, %xmm14 +; SSE-NEXT: por %xmm14, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm12[0] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm12[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm11[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: andps %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[0,2] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: andps %xmm15, %xmm6 -; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: andps %xmm9, %xmm14 +; SSE-NEXT: por %xmm14, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm1[1] +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm12[0,2] +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm15, %xmm12 +; SSE-NEXT: andps %xmm5, %xmm13 +; SSE-NEXT: por %xmm13, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm14 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: 
movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[0,2] -; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: pandn %xmm6, %xmm13 -; SSE-NEXT: andps %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm13[0,2] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pandn %xmm15, %xmm13 +; SSE-NEXT: andps %xmm9, %xmm14 +; SSE-NEXT: por %xmm14, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm10[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: andps %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: andps %xmm5, %xmm15 +; SSE-NEXT: por %xmm15, %xmm14 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2] -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[0,2] +; SSE-NEXT: andps %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2] -; SSE-NEXT: andps %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm5, %xmm15 -; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[0,2] +; SSE-NEXT: andps %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm15, 352(%rax) -; SSE-NEXT: movdqa %xmm14, 336(%rax) -; SSE-NEXT: movdqa %xmm6, 304(%rax) +; SSE-NEXT: movdqa %xmm5, 352(%rax) +; SSE-NEXT: movdqa %xmm9, 336(%rax) +; SSE-NEXT: movdqa %xmm14, 304(%rax) ; SSE-NEXT: movdqa %xmm13, 288(%rax) ; SSE-NEXT: movdqa %xmm12, 256(%rax) -; 
SSE-NEXT: movdqa %xmm3, 240(%rax) -; SSE-NEXT: movdqa %xmm11, 208(%rax) -; SSE-NEXT: movdqa %xmm7, 192(%rax) -; SSE-NEXT: movdqa %xmm8, 160(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movdqa %xmm11, 240(%rax) +; SSE-NEXT: movdqa %xmm8, 208(%rax) +; SSE-NEXT: movdqa %xmm3, 192(%rax) +; SSE-NEXT: movdqa %xmm7, 160(%rax) +; SSE-NEXT: movdqa %xmm6, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1738,7 +1721,8 @@ ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movaps %xmm10, 368(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1753,154 +1737,152 @@ ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $296, %rsp # imm = 0x128 +; SSE-NEXT: addq $280, %rsp # imm = 0x118 ; SSE-NEXT: retq ; ; AVX1-LABEL: vf32: ; AVX1: # %bb.0: -; AVX1-NEXT: subq $184, %rsp +; AVX1-NEXT: subq $120, %rsp ; AVX1-NEXT: vmovdqa 32(%rcx), %xmm8 ; AVX1-NEXT: vmovdqa 48(%rcx), %xmm0 -; AVX1-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX1-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX1-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,3,2,3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7] -; AVX1-NEXT: vmovdqa 48(%r8), %xmm6 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm7[1,2],xmm0[3] -; AVX1-NEXT: vmovdqa 48(%r9), %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7] -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5],xmm3[6,7] -; AVX1-NEXT: vpslld $16, %xmm0, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5],xmm3[6,7] -; 
AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX1-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX1-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,5,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5],xmm4[6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7] -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm4, %ymm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm3[0,1],xmm6[0],xmm3[3] -; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX1-NEXT: vmovdqa 48(%r8), %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm4[1,2],xmm2[3] +; AVX1-NEXT: vmovdqa 48(%r9), %xmm2 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm12[1],xmm5[2,3,4,5,6],xmm12[7] +; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7] +; AVX1-NEXT: vpslld $16, %xmm2, %xmm5 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7] ; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3],xmm3[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7] -; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm3[0,1],xmm7[0],xmm3[3] -; AVX1-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7] -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] ; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,6,5,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; 
AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5],xmm3[6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5,6],xmm3[7] +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX1-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm4[1],xmm12[2,3] +; AVX1-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7] +; AVX1-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm7 +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = xmm6[0,1],xmm0[0],xmm6[3] +; AVX1-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7] +; AVX1-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3],xmm6[4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7] +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = xmm2[0,1],xmm1[0],xmm2[3] +; AVX1-NEXT: vpslldq 
{{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5],xmm7[6,7] +; AVX1-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3],xmm2[4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,1,3,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm0[1,2],xmm3[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7] -; AVX1-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6],xmm5[7] +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqa 16(%rdx), %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5],xmm0[6,7] -; AVX1-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; AVX1-NEXT: vpslld $16, %xmm4, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] 
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] @@ -1935,17 +1917,16 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rcx), %xmm14 -; AVX1-NEXT: vmovdqa (%rdx), %xmm11 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,2,2] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vmovdqa (%rcx), %xmm9 +; AVX1-NEXT: vmovdqa (%rdx), %xmm8 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa (%rsi), %xmm9 -; AVX1-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-NEXT: vmovdqa (%rsi), %xmm6 +; AVX1-NEXT: vmovdqa (%rdi), %xmm5 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -1953,61 +1934,58 @@ ; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm0[1],xmm12[2,3] ; AVX1-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1,2],xmm15[3],xmm13[4,5,6,7] -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,6,5,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,3,4,5],xmm7[6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0],xmm6[1],xmm7[2,3,4,5,6],xmm6[7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm5[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm7 -; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = xmm6[0,1],xmm3[0],xmm6[3] -; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1,2,3,4],xmm5[5],xmm7[6,7] -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm5 +; 
AVX1-NEXT: vpshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3,4,5],xmm14[6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5,6],xmm14[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm14, %ymm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 +; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] +; AVX1-NEXT: vinsertps {{.*#+}} xmm10 = xmm11[0,1],xmm3[0],xmm11[3] +; AVX1-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm14[5],xmm10[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3],xmm11[4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm8 -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6],ymm5[7] -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = xmm5[0,1],xmm1[0],xmm5[3] -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3,4],xmm2[5],xmm7[6,7] -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3],xmm5[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] +; AVX1-NEXT: vinsertps {{.*#+}} xmm8 = xmm6[0,1],xmm1[0],xmm6[3] +; AVX1-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm9 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3],xmm6[4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm3[1,2],xmm4[3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5,6],xmm6[7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] ; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 @@ -2015,15 +1993,13 @@ ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: vmovdqa %xmm0, 48(%rax) ; AVX1-NEXT: vmovdqa %xmm4, 32(%rax) -; AVX1-NEXT: vmovdqa %xmm5, 16(%rax) -; AVX1-NEXT: vmovdqa %xmm2, (%rax) -; AVX1-NEXT: vmovdqa %xmm12, 112(%rax) -; AVX1-NEXT: vmovdqa %xmm15, 96(%rax) -; AVX1-NEXT: vmovdqa %xmm13, 80(%rax) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm0, 64(%rax) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm0, 176(%rax) +; AVX1-NEXT: vmovdqa %xmm6, 16(%rax) +; AVX1-NEXT: vmovdqa %xmm8, (%rax) +; AVX1-NEXT: vmovdqa %xmm2, 112(%rax) +; AVX1-NEXT: vmovdqa %xmm10, 96(%rax) +; AVX1-NEXT: vmovdqa %xmm12, 80(%rax) +; AVX1-NEXT: vmovdqa %xmm13, 64(%rax) +; AVX1-NEXT: vmovdqa %xmm15, 176(%rax) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 160(%rax) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2032,7 +2008,7 @@ ; AVX1-NEXT: vmovaps %xmm0, 128(%rax) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 240(%rax) -; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 224(%rax) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 208(%rax) @@ -2040,7 +2016,7 @@ ; AVX1-NEXT: vmovaps %xmm0, 192(%rax) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 304(%rax) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 288(%rax) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 272(%rax) @@ -2054,31 +2030,31 @@ ; AVX1-NEXT: vmovaps %xmm0, 336(%rax) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 320(%rax) -; 
AVX1-NEXT: addq $184, %rsp +; AVX1-NEXT: addq $120, %rsp ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -2086,7 +2062,7 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] @@ -2095,42 +2071,43 @@ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 @@ -2155,14 
+2132,13 @@ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm14 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm15 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,1,2,3,6,5,6,7] @@ -2183,60 +2159,59 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm13, %ymm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm5, %ymm12, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm13 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = 
ymm12[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm13, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,3,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm10 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX2-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm7, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -2248,86 +2223,86 @@ ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm3 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm14[4],ymm5[4],ymm14[5],ymm5[5],ymm14[6],ymm5[6],ymm14[7],ymm5[7],ymm14[12],ymm5[12],ymm14[13],ymm5[13],ymm14[14],ymm5[14],ymm14[15],ymm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm7, %ymm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm12, %ymm15 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = mem[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,0,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = mem[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-SLOW-NEXT: 
vpbroadcastq %xmm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm7, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2,3],ymm14[4],ymm7[5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[0],ymm6[1],mem[1],ymm6[2],mem[2],ymm6[3],mem[3],ymm6[8],mem[8],ymm6[9],mem[9],ymm6[10],mem[10],ymm6[11],mem[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm15, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 288(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 352(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) @@ -2350,25 +2325,26 @@ ; ; AVX2-FAST-ALL-LABEL: vf32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-ALL-NEXT: subq $616, %rsp # imm = 0x268 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-ALL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-ALL-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-ALL-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm2 -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, %xmm6 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm3 = 
xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm2, %ymm2 @@ -2383,50 +2359,52 @@ ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpshufb %xmm8, %xmm0, %xmm1 -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, %xmm13 -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-ALL-NEXT: vpshufb %xmm8, %xmm11, %xmm0 -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm4, %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm4, %xmm7 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm5, %xmm10 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm8, %xmm13 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm9, %xmm15 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, %xmm8 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, %xmm11 ; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r9), %xmm3 ; AVX2-FAST-ALL-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, %xmm7 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, %xmm8 ; AVX2-FAST-ALL-NEXT: vmovdqa 
%xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %ymm2 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm3, %ymm2 ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %ymm9 -; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] @@ -2441,7 +2419,7 @@ ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-ALL-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2451,11 +2429,11 @@ ; AVX2-FAST-ALL-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %ymm3 ; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm1 = 
ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm5 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -2469,48 +2447,48 @@ ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vpshufb %ymm4, %ymm0, %ymm4 ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm5, %ymm4, %ymm0 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <1,2,1,2,u,u,3,3> ; AVX2-FAST-ALL-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm15 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm15, %xmm8, %xmm13 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm15, %xmm11, %xmm13 ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5,6],ymm13[7] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm7, %xmm4 +; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm8, %xmm4 ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-ALL-NEXT: vpblendvb %ymm4, %ymm13, %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-ALL-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-ALL-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; 
AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-ALL-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-ALL-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-ALL-NEXT: vpshufb %xmm15, %xmm2, %xmm5 ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-ALL-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] +; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,5,6,5,6,7,7] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, %ymm11 ; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = @@ -2519,16 +2497,17 @@ ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm6, %ymm5 +; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vpshufb %ymm15, %ymm8, %ymm5 ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-FAST-ALL-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -2563,36 +2542,35 @@ ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-ALL-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-FAST-ALL-NEXT: vpermd %ymm4, %ymm13, %ymm4 -; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-ALL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-ALL-NEXT: # xmm13 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-ALL-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm4[1],ymm13[2,3],ymm4[4],ymm13[5,6],ymm4[7] -; AVX2-FAST-ALL-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-ALL-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,0,2,1,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-ALL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-ALL-NEXT: # ymm5 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11] +; AVX2-FAST-ALL-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FAST-ALL-NEXT: # xmm13 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2],ymm4[3,4],ymm13[5],ymm4[6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FAST-ALL-NEXT: # xmm13 = mem[0,0,2,1,4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm13, %ymm13 +; 
AVX2-FAST-ALL-NEXT: vpblendvb %ymm15, %ymm4, %ymm13, %ymm4 +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm14 = [5,4,2,2,5,4,6,6] -; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm14, %ymm5 +; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm14, %ymm12 ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6],ymm5[7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm12 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2],ymm5[3,4],ymm12[5],ymm5[6,7] -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm15, %ymm5, %ymm11, %ymm5 -; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm10 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm10 ; AVX2-FAST-ALL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-ALL-NEXT: # ymm6 = ymm6[0],mem[0],ymm6[1],mem[1],ymm6[2],mem[2],ymm6[3],mem[3],ymm6[8],mem[8],ymm6[9],mem[9],ymm6[10],mem[10],ymm6[11],mem[11] ; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm14, %ymm6 -; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] ; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] @@ -2604,7 +2582,7 @@ ; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, 288(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm10, 288(%rax) ; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, (%rax) @@ -2621,22 +2599,22 @@ ; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-ALL-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-ALL-NEXT: addq $616, %rsp # imm = 0x268 ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: 
retq ; ; AVX2-FAST-PERLANE-LABEL: vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $664, %rsp # imm = 0x298 +; AVX2-FAST-PERLANE-NEXT: subq $696, %rsp # imm = 0x2B8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm2 @@ -2644,14 +2622,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] @@ -2660,18 +2638,19 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2685,22 +2664,21 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = 
ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] @@ -2715,7 +2693,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2743,137 +2721,136 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm11, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm11, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = 
[255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm8, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm14[2],ymm7[3,4],ymm14[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm12[4],ymm15[4],ymm12[5],ymm15[5],ymm12[6],ymm15[6],ymm12[7],ymm15[7],ymm12[12],ymm15[12],ymm12[13],ymm15[13],ymm12[14],ymm15[14],ymm12[15],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm8, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4],ymm12[4],ymm0[5],ymm12[5],ymm0[6],ymm12[6],ymm0[7],ymm12[7],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm15[4],ymm3[5],ymm15[5],ymm3[6],ymm15[6],ymm3[7],ymm15[7],ymm3[12],ymm15[12],ymm3[13],ymm15[13],ymm3[14],ymm15[14],ymm3[15],ymm15[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm11, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = 
ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm5, %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm15, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm13, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm11, %ymm8, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # 
xmm11 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm11, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm4[0],ymm12[0],ymm4[1],ymm12[1],ymm4[2],ymm12[2],ymm4[3],ymm12[3],ymm4[8],ymm12[8],ymm4[9],ymm12[9],ymm4[10],ymm12[10],ymm4[11],ymm12[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3],ymm12[4],ymm7[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm12 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = 
mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm5[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm10, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] @@ -2881,30 +2858,32 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax) ; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $664, %rsp # imm = 0x298 +; AVX2-FAST-PERLANE-NEXT: addq $696, %rsp # imm = 0x2B8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -180,9 +180,9 @@ ; SSE-NEXT: movaps (%rsi), %xmm4 ; SSE-NEXT: movaps 16(%rsi), %xmm5 ; SSE-NEXT: movaps 32(%rsi), %xmm6 -; SSE-NEXT: movaps 48(%rsi), %xmm8 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps 48(%rsi), %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] @@ -191,8 +191,8 @@ ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] ; SSE-NEXT: movaps %xmm3, 96(%rdx) ; SSE-NEXT: movaps %xmm6, 112(%rdx) ; SSE-NEXT: movaps %xmm2, 64(%rdx) @@ -200,34 +200,34 @@ ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps %xmm4, 48(%rdx) ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm7, 16(%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i32_stride2_vf16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovaps (%rsi), %xmm0 -; AVX1-NEXT: vmovaps 16(%rsi), %xmm8 +; AVX1-NEXT: vmovaps 16(%rsi), %xmm1 ; AVX1-NEXT: vmovaps 32(%rsi), %xmm2 ; AVX1-NEXT: vmovaps 48(%rsi), %xmm3 ; AVX1-NEXT: vmovaps (%rdi), %xmm4 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm5 ; AVX1-NEXT: vmovaps 32(%rdi), %xmm6 ; AVX1-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm8 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = 
xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX1-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX1-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -286,64 +286,62 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride2_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 96(%rdi), %xmm6 -; SSE-NEXT: movaps 80(%rdi), %xmm8 -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 16(%rdi), %xmm14 -; SSE-NEXT: movaps 32(%rdi), %xmm15 +; SSE-NEXT: movaps 80(%rdi), %xmm4 +; SSE-NEXT: movaps 64(%rdi), %xmm3 +; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm5 -; SSE-NEXT: movaps 96(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rsi), %xmm11 ; SSE-NEXT: movaps 80(%rsi), %xmm12 ; SSE-NEXT: movaps 64(%rsi), %xmm13 -; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rsi), %xmm0 -; SSE-NEXT: movaps 48(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: movaps (%rsi), %xmm9 +; SSE-NEXT: movaps 16(%rsi), %xmm10 +; SSE-NEXT: movaps 32(%rsi), %xmm14 +; SSE-NEXT: movaps 48(%rsi), %xmm15 +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1] -; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movaps %xmm2, 
%xmm10 +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1] +; SSE-NEXT: movaps %xmm3, %xmm15 +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] +; SSE-NEXT: movaps %xmm4, %xmm13 ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: movaps 112(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE-NEXT: movaps %xmm4, 224(%rdx) +; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] +; SSE-NEXT: movaps 112(%rsi), %xmm11 +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movaps %xmm0, 224(%rdx) ; SSE-NEXT: movaps %xmm7, 240(%rdx) ; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm3, 208(%rdx) -; SSE-NEXT: movaps %xmm8, 160(%rdx) +; SSE-NEXT: movaps %xmm12, 208(%rdx) +; SSE-NEXT: movaps %xmm4, 160(%rdx) ; SSE-NEXT: movaps %xmm13, 176(%rdx) -; SSE-NEXT: movaps %xmm9, 128(%rdx) -; SSE-NEXT: movaps %xmm1, 144(%rdx) +; SSE-NEXT: movaps %xmm3, 128(%rdx) +; SSE-NEXT: movaps %xmm15, 144(%rdx) ; SSE-NEXT: movaps %xmm5, 96(%rdx) -; SSE-NEXT: movaps %xmm0, 112(%rdx) -; SSE-NEXT: movaps %xmm15, 64(%rdx) -; SSE-NEXT: movaps %xmm2, 80(%rdx) -; SSE-NEXT: movaps %xmm14, 32(%rdx) -; SSE-NEXT: movaps %xmm10, 48(%rdx) -; SSE-NEXT: movaps %xmm11, (%rdx) +; SSE-NEXT: movaps %xmm14, 112(%rdx) +; SSE-NEXT: movaps %xmm2, 64(%rdx) +; SSE-NEXT: movaps %xmm10, 80(%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm9, 48(%rdx) +; SSE-NEXT: movaps %xmm8, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: retq @@ -354,50 +352,50 @@ ; AVX1-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX1-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps 80(%rsi), %xmm2 ; AVX1-NEXT: vmovaps 80(%rdi), %xmm3 ; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm10 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 
; AVX1-NEXT: vmovaps (%rsi), %xmm3 ; AVX1-NEXT: vmovaps 16(%rsi), %xmm4 -; AVX1-NEXT: vmovaps 32(%rsi), %xmm11 +; AVX1-NEXT: vmovaps 32(%rsi), %xmm5 ; AVX1-NEXT: vmovaps 48(%rsi), %xmm6 ; AVX1-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm10 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm11 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm8 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps 112(%rsi), %xmm4 -; AVX1-NEXT: vmovaps 112(%rdi), %xmm5 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-NEXT: vmovaps %ymm4, 224(%rdx) -; AVX1-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX1-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-NEXT: vmovaps 112(%rsi), %xmm7 +; AVX1-NEXT: vmovaps 112(%rdi), %xmm8 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm9 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-NEXT: vmovaps %ymm7, 224(%rdx) +; AVX1-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX1-NEXT: vmovaps %ymm6, 96(%rdx) ; AVX1-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-NEXT: vmovaps %ymm10, 160(%rdx) -; AVX1-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX1-NEXT: vmovaps %ymm8, 192(%rdx) +; AVX1-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX1-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX1-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -407,50 +405,50 @@ ; AVX2-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-NEXT: vmovaps 80(%rsi), %xmm3 ; AVX2-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX2-NEXT: vunpckhps {{.*#+}} 
xmm10 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm11 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-NEXT: vmovaps (%rsi), %xmm4 ; AVX2-NEXT: vmovaps 16(%rsi), %xmm5 ; AVX2-NEXT: vmovaps 32(%rsi), %xmm6 ; AVX2-NEXT: vmovaps 48(%rsi), %xmm7 -; AVX2-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX2-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm15 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX2-NEXT: vmovaps 112(%rsi), %xmm6 -; AVX2-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-NEXT: vmovaps 96(%rsi), %xmm6 -; AVX2-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; AVX2-NEXT: vmovaps %xmm0, 192(%rdx) -; AVX2-NEXT: vmovaps %xmm1, 208(%rdx) -; AVX2-NEXT: vmovaps %xmm2, 224(%rdx) -; AVX2-NEXT: vmovaps %xmm5, 240(%rdx) -; AVX2-NEXT: vmovaps %xmm3, 64(%rdx) -; AVX2-NEXT: vmovaps %xmm7, 80(%rdx) -; AVX2-NEXT: vmovaps %xmm4, 96(%rdx) +; AVX2-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-NEXT: vmovaps 16(%rdi), %xmm9 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-NEXT: vmovaps 48(%rdi), %xmm11 +; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm11 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX2-NEXT: vmovaps 112(%rsi), %xmm10 +; AVX2-NEXT: vmovaps 112(%rdi), %xmm13 +; AVX2-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] +; AVX2-NEXT: vmovaps 96(%rsi), %xmm13 +; AVX2-NEXT: vmovaps 96(%rdi), %xmm15 +; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX2-NEXT: vmovaps %xmm13, 192(%rdx) +; AVX2-NEXT: vmovaps %xmm0, 208(%rdx) +; AVX2-NEXT: vmovaps %xmm10, 224(%rdx) +; AVX2-NEXT: vmovaps %xmm14, 240(%rdx) +; AVX2-NEXT: vmovaps %xmm6, 64(%rdx) +; AVX2-NEXT: vmovaps %xmm11, 80(%rdx) +; AVX2-NEXT: vmovaps %xmm7, 96(%rdx) ; AVX2-NEXT: vmovaps %xmm8, 112(%rdx) -; AVX2-NEXT: vmovaps %xmm15, (%rdx) -; AVX2-NEXT: vmovaps %xmm14, 16(%rdx) -; AVX2-NEXT: vmovaps %xmm13, 32(%rdx) +; AVX2-NEXT: vmovaps %xmm4, (%rdx) +; AVX2-NEXT: vmovaps %xmm9, 16(%rdx) +; 
AVX2-NEXT: vmovaps %xmm5, 32(%rdx) ; AVX2-NEXT: vmovaps %xmm12, 48(%rdx) -; AVX2-NEXT: vmovaps %xmm11, 160(%rdx) -; AVX2-NEXT: vmovaps %xmm10, 176(%rdx) -; AVX2-NEXT: vmovaps %xmm9, 128(%rdx) +; AVX2-NEXT: vmovaps %xmm3, 160(%rdx) +; AVX2-NEXT: vmovaps %xmm2, 176(%rdx) +; AVX2-NEXT: vmovaps %xmm1, 128(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm0, 144(%rdx) ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -206,41 +206,41 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm6 +; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps (%rsi), %xmm3 ; SSE-NEXT: movaps 16(%rsi), %xmm5 -; SSE-NEXT: movaps (%rdx), %xmm8 -; SSE-NEXT: movaps 16(%rdx), %xmm9 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: movaps 16(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm9[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,0] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,0] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm8[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: 
movaps %xmm9, (%rcx) ; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps %xmm2, 48(%rcx) +; SSE-NEXT: movaps %xmm8, 48(%rcx) ; SSE-NEXT: movaps %xmm5, 64(%rcx) -; SSE-NEXT: movaps %xmm6, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps %xmm1, 80(%rcx) ; SSE-NEXT: retq ; @@ -391,139 +391,140 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm5 -; SSE-NEXT: movaps 48(%rdi), %xmm15 -; SSE-NEXT: movaps (%rsi), %xmm11 -; SSE-NEXT: movaps 16(%rsi), %xmm12 -; SSE-NEXT: movaps 32(%rsi), %xmm14 -; SSE-NEXT: movaps 48(%rsi), %xmm4 -; SSE-NEXT: movaps 16(%rdx), %xmm10 +; SSE-NEXT: movaps 32(%rdi), %xmm1 +; SSE-NEXT: movaps 48(%rdi), %xmm8 +; SSE-NEXT: movaps (%rsi), %xmm5 +; SSE-NEXT: movaps 16(%rsi), %xmm9 +; SSE-NEXT: movaps 32(%rsi), %xmm10 +; SSE-NEXT: movaps 48(%rsi), %xmm11 +; SSE-NEXT: movaps 16(%rdx), %xmm0 ; SSE-NEXT: movaps 32(%rdx), %xmm3 -; SSE-NEXT: movaps 48(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm15, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] -; SSE-NEXT: movaps %xmm15, %xmm6 -; SSE-NEXT: movaps %xmm15, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm4[3,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm13[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm6[2,0] -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm14[1] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm14[3,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm10[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm10[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,0] -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; SSE-NEXT: movaps 48(%rdx), %xmm7 +; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] +; SSE-NEXT: movaps %xmm8, %xmm13 ; SSE-NEXT: movaps %xmm8, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; SSE-NEXT: movaps %xmm8, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm11[3,3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm7[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 
= xmm6[0,1],xmm13[2,0] +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm10[3,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm12[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm3[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,0] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3] +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm0[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2,0] +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3] ; SSE-NEXT: movaps (%rdx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm12[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,0] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm6, (%rcx) -; SSE-NEXT: movaps %xmm11, 16(%rcx) -; SSE-NEXT: movaps %xmm7, 48(%rcx) -; SSE-NEXT: movaps %xmm12, 64(%rcx) -; SSE-NEXT: movaps %xmm13, 96(%rcx) -; SSE-NEXT: movaps %xmm14, 112(%rcx) -; SSE-NEXT: movaps %xmm9, 144(%rcx) -; SSE-NEXT: movaps %xmm4, 160(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] -; SSE-NEXT: movaps %xmm8, 32(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[2,0] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[1,2],mem[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm0[2,3] +; SSE-NEXT: movaps %xmm13, (%rcx) +; SSE-NEXT: movaps %xmm5, 16(%rcx) +; SSE-NEXT: movaps %xmm15, 48(%rcx) +; SSE-NEXT: movaps %xmm9, 64(%rcx) +; SSE-NEXT: movaps %xmm14, 96(%rcx) +; SSE-NEXT: movaps %xmm10, 112(%rcx) +; SSE-NEXT: movaps %xmm6, 144(%rcx) +; SSE-NEXT: movaps %xmm11, 160(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] +; SSE-NEXT: movaps %xmm4, 32(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] ; SSE-NEXT: movaps %xmm2, 80(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] -; SSE-NEXT: movaps %xmm5, 128(%rcx) -; SSE-NEXT: shufps 
{{.*#+}} xmm15 = xmm15[2,0,1,3] -; SSE-NEXT: movaps %xmm15, 176(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: movaps %xmm1, 128(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] +; SSE-NEXT: movaps %xmm8, 176(%rcx) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i32_stride3_vf16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdx), %ymm8 -; AVX1-NEXT: vmovapd 32(%rdx), %ymm9 +; AVX1-NEXT: vmovapd (%rdx), %ymm0 +; AVX1-NEXT: vmovapd 32(%rdx), %ymm1 ; AVX1-NEXT: vmovaps (%rsi), %xmm2 ; AVX1-NEXT: vmovaps 16(%rsi), %xmm3 -; AVX1-NEXT: vmovapd 32(%rsi), %xmm4 +; AVX1-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX1-NEXT: vmovaps 48(%rsi), %xmm5 ; AVX1-NEXT: vmovaps (%rdi), %xmm6 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-NEXT: vmovapd 32(%rdi), %xmm0 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm2[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1],xmm1[0,2] +; AVX1-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm6[1],xmm2[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm9 = xmm2[1,1],xmm9[0,2] ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vbroadcastsd (%rdx), %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX1-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3],xmm5[3,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm9[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,3,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6],ymm5[7] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm4[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[1,1],xmm5[0,2] -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,0],xmm0[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd 32(%rdx), %ymm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm7[3,3],xmm3[3,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm3[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm8[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm4[0,0,3,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = mem[0,0,3,3,4,4,7,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = mem[1,0,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm9[1,1,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = mem[0,0,3,3,4,4,7,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = mem[1,0,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm8[1,1,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX1-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX1-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastsd (%rdx), %ymm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7] +; 
AVX1-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX1-NEXT: vshufps {{.*#+}} xmm9 = xmm6[3,3],xmm5[3,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,2,3] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[0,0,3,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1],xmm6[0,2] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-NEXT: vbroadcastsd 32(%rdx), %ymm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] +; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm3[3,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[0,0,3,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5],ymm1[6],ymm6[7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] +; AVX1-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX1-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX1-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-NEXT: vmovaps %ymm5, 160(%rcx) ; AVX1-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -719,99 +720,99 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $168, %rsp +; SSE-NEXT: subq $152, %rsp ; SSE-NEXT: movaps (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm10 -; SSE-NEXT: movaps 32(%rdi), %xmm9 -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps (%rsi), %xmm4 -; SSE-NEXT: movaps 16(%rsi), %xmm5 -; SSE-NEXT: movaps 32(%rsi), %xmm14 -; SSE-NEXT: movaps 48(%rsi), %xmm13 -; SSE-NEXT: movaps (%rdx), %xmm6 -; SSE-NEXT: movaps 16(%rdx), %xmm7 -; SSE-NEXT: movaps 32(%rdx), %xmm3 -; SSE-NEXT: movaps 48(%rdx), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps 32(%rdi), %xmm3 +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps (%rsi), %xmm12 +; SSE-NEXT: movaps 16(%rsi), %xmm11 +; SSE-NEXT: movaps 32(%rsi), %xmm10 +; SSE-NEXT: movaps 48(%rsi), %xmm9 +; SSE-NEXT: movaps (%rdx), %xmm5 +; SSE-NEXT: movaps 16(%rdx), %xmm6 +; SSE-NEXT: movaps 32(%rdx), %xmm7 +; SSE-NEXT: movaps 48(%rdx), %xmm8 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3] -; SSE-NEXT: movaps %xmm6, %xmm11 -; SSE-NEXT: movaps %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3] +; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm12[3,3] ; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm11[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm14[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[0,2] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm5[3,3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,3] -; SSE-NEXT: movaps %xmm9, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm14[3,3] -; SSE-NEXT: movaps %xmm9, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm11[3,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm6[1,1] +; 
SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm10[3,3] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm13[3,3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm9[3,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm9 ; SSE-NEXT: movaps 64(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps 64(%rsi), %xmm11 -; SSE-NEXT: movaps %xmm13, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm11[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] +; SSE-NEXT: movaps 64(%rsi), %xmm12 +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm12[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] ; SSE-NEXT: movaps 80(%rdi), %xmm2 ; SSE-NEXT: movaps 80(%rdx), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,3] ; 
SSE-NEXT: movaps 80(%rsi), %xmm8 -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] @@ -832,9 +833,9 @@ ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm3[0,2] ; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps 112(%rdx), %xmm9 +; SSE-NEXT: movaps 112(%rdx), %xmm10 ; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm9[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[0,3] ; SSE-NEXT: movaps 112(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm3, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] @@ -842,14 +843,13 @@ ; SSE-NEXT: movaps %xmm3, %xmm5 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm10[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] ; SSE-NEXT: movaps %xmm15, %xmm5 ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[1,2],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[2,3] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -858,21 +858,21 @@ ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[1,2],mem[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[1,2],mem[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[1,2],mem[2,3] ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3] ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm10[2,3] ; SSE-NEXT: movaps %xmm1, 352(%rcx) ; SSE-NEXT: movaps %xmm4, 336(%rcx) ; SSE-NEXT: movaps %xmm6, 304(%rcx) ; SSE-NEXT: movaps %xmm7, 288(%rcx) ; SSE-NEXT: movaps %xmm8, 256(%rcx) -; SSE-NEXT: movaps %xmm10, 240(%rcx) -; SSE-NEXT: movaps %xmm11, 208(%rcx) -; SSE-NEXT: movaps %xmm12, 192(%rcx) +; SSE-NEXT: movaps %xmm11, 240(%rcx) +; SSE-NEXT: movaps %xmm12, 208(%rcx) +; SSE-NEXT: movaps %xmm13, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 160(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -895,8 +895,8 @@ ; SSE-NEXT: movaps %xmm0, 320(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] ; SSE-NEXT: movaps %xmm2, 272(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm13 = 
xmm13[2,0,1,3] -; SSE-NEXT: movaps %xmm13, 224(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] +; SSE-NEXT: movaps %xmm9, 224(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 176(%rcx) @@ -909,119 +909,119 @@ ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,3] ; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: addq $168, %rsp +; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i32_stride3_vf32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdx), %ymm14 -; AVX1-NEXT: vmovapd 32(%rdx), %ymm8 +; AVX1-NEXT: vmovapd (%rdx), %ymm0 +; AVX1-NEXT: vmovapd 32(%rdx), %ymm3 ; AVX1-NEXT: vmovapd 64(%rdx), %ymm5 -; AVX1-NEXT: vmovapd 96(%rdx), %ymm15 -; AVX1-NEXT: vmovaps (%rsi), %xmm2 +; AVX1-NEXT: vmovapd 96(%rdx), %ymm2 +; AVX1-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-NEXT: vmovaps 16(%rsi), %xmm7 ; AVX1-NEXT: vmovaps 32(%rsi), %xmm10 -; AVX1-NEXT: vmovapd 48(%rsi), %xmm0 +; AVX1-NEXT: vmovaps 48(%rsi), %xmm9 ; AVX1-NEXT: vmovaps (%rdi), %xmm4 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm8 ; AVX1-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm2[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[1,1],xmm3[0,2] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastsd (%rdx), %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX1-NEXT: vmovaps 80(%rsi), %xmm3 -; AVX1-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm4[3,3],xmm3[3,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm4[0,0,3,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX1-NEXT: vmovaps 64(%rsi), %xmm3 -; AVX1-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm3[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm3[1,1],xmm6[0,2] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-NEXT: vbroadcastsd 64(%rdx), %ymm4 -; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX1-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3],xmm0[3,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm8[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,3,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm10[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1],xmm0[0,2] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm11[0] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm11[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-NEXT: vbroadcastsd 32(%rdx), %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX1-NEXT: vmovaps 112(%rsi), %xmm0 -; AVX1-NEXT: vmovaps 112(%rdi), %xmm3 -; AVX1-NEXT: 
vshufps {{.*#+}} xmm4 = xmm3[3,3],xmm0[3,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm15[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,3,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] -; AVX1-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX1-NEXT: vmovaps 96(%rdi), %xmm3 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm0[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,1],xmm4[0,2] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd 96(%rdx), %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3],xmm7[3,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1],xmm1[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm14[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,3,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = mem[1,0,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm5[1,1,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = mem[0,0,3,3,4,4,7,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = mem[1,0,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm8[1,1,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = mem[0,0,3,3,4,4,7,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = mem[1,0,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm15[1,1,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm7 = ymm14[1,1,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] -; AVX1-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX1-NEXT: vmovaps %ymm5, 320(%rcx) -; AVX1-NEXT: vmovaps %ymm4, 128(%rcx) -; AVX1-NEXT: vmovaps %ymm3, 224(%rcx) -; AVX1-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-NEXT: vmovaps %ymm0, 288(%rcx) +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm1[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1],xmm6[0,2] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd (%rdx), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX1-NEXT: vmovaps 80(%rsi), %xmm4 +; AVX1-NEXT: vmovaps 80(%rdi), %xmm6 +; AVX1-NEXT: vshufps {{.*#+}} xmm12 = xmm6[3,3],xmm4[3,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm4 
= xmm4[1,1],xmm6[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm4 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3,2,3] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[0,0,3,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6],ymm6[7] +; AVX1-NEXT: vmovaps 64(%rsi), %xmm6 +; AVX1-NEXT: vmovaps 64(%rdi), %xmm12 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm6[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm13 = xmm6[1,1],xmm13[0,2] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0] +; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0],xmm12[2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 +; AVX1-NEXT: vbroadcastsd 64(%rdx), %ymm12 +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3,4],ymm12[5],ymm6[6,7] +; AVX1-NEXT: vmovaps 48(%rdi), %xmm12 +; AVX1-NEXT: vshufps {{.*#+}} xmm13 = xmm12[3,3],xmm9[3,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,1],xmm12[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3,2,3] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm12 = ymm12[0,0,3,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm11[1],xmm10[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm12 = xmm10[1,1],xmm12[0,2] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; AVX1-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0],xmm11[2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-NEXT: vbroadcastsd 32(%rdx), %ymm11 +; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX1-NEXT: vmovaps 112(%rsi), %xmm11 +; AVX1-NEXT: vmovaps 112(%rdi), %xmm12 +; AVX1-NEXT: vshufps {{.*#+}} xmm13 = xmm12[3,3],xmm11[3,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1],xmm12[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,2,3] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm12 = ymm12[0,0,3,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7] +; AVX1-NEXT: vmovaps 96(%rsi), %xmm12 +; AVX1-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm13[1],xmm12[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm14 = xmm12[1,1],xmm14[0,2] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] +; AVX1-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 +; AVX1-NEXT: vbroadcastsd 96(%rdx), %ymm13 +; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7] +; AVX1-NEXT: vshufps {{.*#+}} xmm13 = xmm8[3,3],xmm7[3,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm8 = ymm8[0,0,3,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[1,1,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX1-NEXT: vpermilpd 
{{.*#+}} ymm13 = mem[1,0,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7] +; AVX1-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-NEXT: vmovaps %ymm2, 320(%rcx) +; AVX1-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX1-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX1-NEXT: vmovaps %ymm7, 64(%rcx) +; AVX1-NEXT: vmovaps %ymm12, 288(%rcx) ; AVX1-NEXT: vmovaps %ymm11, 352(%rcx) ; AVX1-NEXT: vmovaps %ymm10, 96(%rcx) ; AVX1-NEXT: vmovaps %ymm9, 160(%rcx) -; AVX1-NEXT: vmovaps %ymm13, 192(%rcx) -; AVX1-NEXT: vmovaps %ymm12, 256(%rcx) -; AVX1-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX1-NEXT: vmovaps %ymm4, 256(%rcx) +; AVX1-NEXT: vmovaps %ymm1, (%rcx) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -221,97 +221,97 @@ ; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps (%rsi), %xmm5 -; SSE-NEXT: movaps 16(%rsi), %xmm8 -; SSE-NEXT: movaps (%rdx), %xmm3 +; SSE-NEXT: movaps 16(%rsi), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm7 ; SSE-NEXT: movaps 16(%rdx), %xmm4 -; SSE-NEXT: movaps (%rcx), %xmm6 +; SSE-NEXT: movaps (%rcx), %xmm8 ; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm3, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; 
SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: movaps %xmm1, 96(%r8) -; SSE-NEXT: movaps %xmm3, 112(%r8) -; SSE-NEXT: movaps %xmm6, 64(%r8) -; SSE-NEXT: movaps %xmm7, 80(%r8) +; SSE-NEXT: movaps %xmm6, 112(%r8) +; SSE-NEXT: movaps %xmm8, 64(%r8) +; SSE-NEXT: movaps %xmm10, 80(%r8) ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps %xmm5, 48(%r8) ; SSE-NEXT: movaps %xmm2, (%r8) -; SSE-NEXT: movaps %xmm10, 16(%r8) +; SSE-NEXT: movaps %xmm3, 16(%r8) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i32_stride4_vf8: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm10 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX1-NEXT: vmovaps (%rsi), %xmm4 ; AVX1-NEXT: vmovaps 16(%rsi), %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm10[1],xmm1[1],zero,zero -; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm8 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[1],xmm1[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-NEXT: vmovaps (%rcx), %xmm5 ; AVX1-NEXT: vmovaps 16(%rcx), %xmm6 ; AVX1-NEXT: vmovaps (%rdx), %xmm7 -; AVX1-NEXT: vmovaps 16(%rdx), %xmm3 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm3[0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2,3],ymm8[4,5],ymm0[6,7] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm4[1],zero,zero -; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm9 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm11 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm7[0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX1-NEXT: vmovaps 16(%rdx), %xmm8 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm10 = xmm6[0],xmm8[0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,0] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] +; AVX1-NEXT: vinsertps {{.*#+}} xmm9 = xmm2[1],xmm4[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = 
xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm11 = xmm5[0],xmm7[0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,2,0] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm7[2],xmm5[2] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,0],xmm2[3,0] ; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm6[2] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,0],xmm10[3,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX1-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm8[2],xmm6[2] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] +; AVX1-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-NEXT: vmovaps %ymm2, 32(%r8) ; AVX1-NEXT: vmovaps %ymm9, (%r8) -; AVX1-NEXT: vmovaps %ymm8, 64(%r8) +; AVX1-NEXT: vmovaps %ymm3, 64(%r8) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: store_i32_stride4_vf8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-NEXT: vmovaps (%rdx), %ymm2 ; AVX2-NEXT: vmovaps (%rcx), %ymm3 @@ -320,28 +320,28 @@ ; AVX2-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-NEXT: vmovaps (%rsi), %xmm7 -; AVX2-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; AVX2-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] ; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 
= ymm4[0,2,2,3] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[4],ymm1[4],ymm8[5],ymm1[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX2-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-NEXT: vmovaps %ymm0, (%r8) +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-NEXT: vmovaps %ymm4, (%r8) ; AVX2-NEXT: vmovaps %ymm6, 32(%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -378,90 +378,90 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride4_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm10 -; SSE-NEXT: movaps 16(%rdi), %xmm13 -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps (%rsi), %xmm3 -; SSE-NEXT: movaps 16(%rsi), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps 16(%rdi), %xmm11 +; SSE-NEXT: movaps 32(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rsi), %xmm3 ; SSE-NEXT: movaps 32(%rsi), %xmm9 -; SSE-NEXT: movaps (%rdx), %xmm0 -; SSE-NEXT: movaps 16(%rdx), %xmm5 -; SSE-NEXT: movaps 32(%rdx), %xmm6 -; SSE-NEXT: movaps (%rcx), %xmm11 +; SSE-NEXT: movaps (%rdx), %xmm7 +; SSE-NEXT: movaps 16(%rdx), %xmm13 +; SSE-NEXT: movaps 32(%rdx), %xmm10 +; SSE-NEXT: movaps (%rcx), %xmm8 ; SSE-NEXT: movaps 16(%rcx), %xmm14 ; SSE-NEXT: movaps 32(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm3[2],xmm10[3],xmm3[3] -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} 
xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movaps %xmm13, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; SSE-NEXT: movaps %xmm13, %xmm11 -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm5[0] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps 48(%rdx), %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm15[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; SSE-NEXT: movaps %xmm11, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] +; SSE-NEXT: movaps 48(%rdx), %xmm15 +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; SSE-NEXT: movaps 48(%rcx), %xmm12 -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm6[1] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] -; SSE-NEXT: movaps 48(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, %xmm1 
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, 224(%r8) -; SSE-NEXT: movaps %xmm2, 240(%r8) +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm10[0] +; SSE-NEXT: movaps %xmm15, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; SSE-NEXT: movaps 48(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0] +; SSE-NEXT: movaps %xmm2, 224(%r8) +; SSE-NEXT: movaps %xmm1, 240(%r8) ; SSE-NEXT: movaps %xmm3, 192(%r8) -; SSE-NEXT: movaps %xmm1, 208(%r8) -; SSE-NEXT: movaps %xmm8, 160(%r8) +; SSE-NEXT: movaps %xmm0, 208(%r8) +; SSE-NEXT: movaps %xmm4, 160(%r8) ; SSE-NEXT: movaps %xmm9, 176(%r8) -; SSE-NEXT: movaps %xmm5, 128(%r8) +; SSE-NEXT: movaps %xmm13, 128(%r8) ; SSE-NEXT: movaps %xmm14, 144(%r8) -; SSE-NEXT: movaps %xmm13, 96(%r8) -; SSE-NEXT: movaps %xmm11, 112(%r8) +; SSE-NEXT: movaps %xmm11, 96(%r8) +; SSE-NEXT: movaps %xmm8, 112(%r8) ; SSE-NEXT: movaps %xmm7, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movaps %xmm10, 32(%r8) +; SSE-NEXT: movaps %xmm5, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps %xmm15, (%r8) +; SSE-NEXT: movaps %xmm6, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: retq @@ -469,100 +469,100 @@ ; AVX1-LABEL: store_i32_stride4_vf16: ; AVX1: # %bb.0: ; AVX1-NEXT: subq $24, %rsp -; AVX1-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm11 -; AVX1-NEXT: vmovaps 16(%rsi), %xmm13 -; AVX1-NEXT: vmovaps 32(%rsi), %xmm14 -; AVX1-NEXT: vmovaps 48(%rsi), %xmm10 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[1],xmm10[1],zero,zero -; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8 -; AVX1-NEXT: vmovaps 16(%rcx), %xmm12 -; AVX1-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX1-NEXT: vmovaps 48(%rcx), %xmm2 -; AVX1-NEXT: vmovaps 16(%rdx), %xmm15 -; AVX1-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX1-NEXT: vmovaps 48(%rdx), %xmm4 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm2[0],xmm4[0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,2,0] -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps 
%xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[1],xmm14[1],zero,zero -; AVX1-NEXT: vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm8 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm3[0],xmm1[0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,0] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm7[1],xmm13[1],zero,zero -; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm12[0],xmm15[0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,0] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX1-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX1-NEXT: vmovaps 16(%rsi), %xmm1 +; AVX1-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX1-NEXT: vmovaps 48(%rsi), %xmm9 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm7[1],xmm9[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vmovaps 16(%rcx), %xmm5 +; AVX1-NEXT: vmovaps 32(%rcx), %xmm10 +; AVX1-NEXT: vmovaps 48(%rcx), %xmm13 +; AVX1-NEXT: vmovaps 16(%rdx), %xmm6 +; AVX1-NEXT: vmovaps 32(%rdx), %xmm11 +; AVX1-NEXT: vmovaps 48(%rdx), %xmm14 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm8 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm13[0],xmm14[0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[0,1,2,0] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3],ymm3[4,5],ymm8[6,7] +; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm2[1],xmm4[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm10[0],xmm11[0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[0,1,2,0] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm8[2,3],ymm3[4,5],ymm8[6,7] +; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[1],xmm1[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm5[0],xmm6[0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[0,1,2,0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX1-NEXT: vblendps {{.*#+}} 
ymm0 = ymm3[0,1],ymm12[2,3],ymm3[4,5],ymm12[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-NEXT: vmovaps (%rsi), %xmm7 -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm8[1],xmm7[1],zero,zero -; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm9 -; AVX1-NEXT: vmovaps (%rcx), %xmm6 +; AVX1-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-NEXT: vmovaps (%rsi), %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm15 = xmm3[1],xmm1[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm8, %ymm8 +; AVX1-NEXT: vmovaps (%rcx), %xmm15 ; AVX1-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm0[0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,2,0] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[2],xmm6[2] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,0],xmm8[3,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm4[2],xmm2[2] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm10[3,0],xmm11[3,0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm0[0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[0,1,2,0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[2],xmm15[2] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[3,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,0,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm14[2],xmm13[2] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm9[3,0],xmm7[3,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm11[2],xmm10[2] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; 
AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm7[3,0] ; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm1[2],xmm3[2] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm14[3,0],xmm3[3,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm15[2],xmm12[2] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm6[2],xmm5[2] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm5[3,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm6[3,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX1-NEXT: vmovaps %ymm2, 96(%r8) ; AVX1-NEXT: vmovaps %ymm1, 160(%r8) ; AVX1-NEXT: vmovaps %ymm0, 224(%r8) -; AVX1-NEXT: vmovaps %ymm8, 32(%r8) -; AVX1-NEXT: vmovaps %ymm9, (%r8) +; AVX1-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-NEXT: vmovaps %ymm8, (%r8) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 64(%r8) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -575,69 +575,69 @@ ; ; AVX2-LABEL: store_i32_stride4_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rdi), %ymm9 -; AVX2-NEXT: vmovaps (%rcx), %xmm10 +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovaps (%rcx), %xmm4 ; AVX2-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-NEXT: vmovaps (%rdx), %xmm5 ; AVX2-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-NEXT: vmovaps (%rsi), %xmm7 -; AVX2-NEXT: vmovaps 32(%rsi), %xmm2 -; AVX2-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm11 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-NEXT: 
vmovaps 32(%rsi), %xmm8 +; AVX2-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] ; AVX2-NEXT: vmovaps 32(%rdi), %ymm11 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-NEXT: vmovaps (%rsi), %ymm6 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX2-NEXT: vmovaps 32(%rsi), %ymm10 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] -; AVX2-NEXT: vmovaps 32(%rcx), %ymm10 -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5],ymm5[6,7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm10[0],ymm2[1],ymm10[1],ymm2[4],ymm10[4],ymm2[5],ymm10[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] +; AVX2-NEXT: vmovaps 32(%rdx), %ymm8 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-NEXT: vmovaps 32(%rcx), %ymm5 +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-NEXT: vmovaps (%rdx), %ymm9 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm5 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] +; AVX2-NEXT: vmovaps (%rcx), %ymm8 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-NEXT: vmovaps (%rdx), %ymm7 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[6],ymm10[6],ymm2[7],ymm10[7] -; AVX2-NEXT: vmovaps (%rcx), %ymm10 -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 
= ymm4[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm7[0],ymm10[0],ymm7[1],ymm10[1],ymm7[4],ymm10[4],ymm7[5],ymm10[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[4],ymm6[4],ymm9[5],ymm6[5] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5],ymm4[6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm7[2],ymm10[2],ymm7[3],ymm10[3],ymm7[6],ymm10[6],ymm7[7],ymm10[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] -; AVX2-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-NEXT: vmovaps %ymm2, 224(%r8) -; AVX2-NEXT: vmovaps %ymm5, 192(%r8) -; AVX2-NEXT: vmovaps %ymm0, (%r8) +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm8 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm8[0,2,2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] +; AVX2-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-NEXT: vmovaps %ymm10, 64(%r8) +; AVX2-NEXT: vmovaps %ymm5, 224(%r8) +; AVX2-NEXT: vmovaps %ymm7, 192(%r8) +; AVX2-NEXT: vmovaps %ymm4, (%r8) ; AVX2-NEXT: vmovaps %ymm3, 160(%r8) -; AVX2-NEXT: vmovaps %ymm8, 128(%r8) +; AVX2-NEXT: vmovaps %ymm2, 128(%r8) ; AVX2-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -695,133 +695,133 @@ ; SSE-LABEL: store_i32_stride4_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $184, %rsp -; SSE-NEXT: movaps (%rdi), %xmm14 +; SSE-NEXT: movaps (%rdi), %xmm10 ; SSE-NEXT: movaps 16(%rdi), %xmm11 ; SSE-NEXT: movaps 32(%rdi), %xmm12 ; SSE-NEXT: movaps 48(%rdi), %xmm13 -; SSE-NEXT: movaps (%rsi), %xmm15 -; SSE-NEXT: movaps 16(%rsi), %xmm9 -; SSE-NEXT: movaps 32(%rsi), %xmm8 -; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: movaps (%rsi), %xmm5 +; SSE-NEXT: movaps 16(%rsi), %xmm2 +; SSE-NEXT: movaps 32(%rsi), %xmm0 +; SSE-NEXT: movaps (%rdx), %xmm6 ; SSE-NEXT: movaps 16(%rdx), %xmm4 ; SSE-NEXT: movaps 32(%rdx), %xmm1 ; SSE-NEXT: movaps (%rcx), %xmm7 -; SSE-NEXT: movaps 16(%rcx), %xmm0 -; SSE-NEXT: movaps 32(%rcx), %xmm10 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: movaps 16(%rcx), %xmm8 +; SSE-NEXT: movaps 32(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm9[0] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; SSE-NEXT: movaps %xmm11, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: movaps %xmm11, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movaps %xmm12, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; SSE-NEXT: movaps 48(%rdx), %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdx), %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movaps 48(%rcx), %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movaps 48(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm13, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps 48(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm0 ; SSE-NEXT: movaps 64(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movaps 64(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rsi), %xmm4 +; SSE-NEXT: movaps 64(%rsi), %xmm5 ; SSE-NEXT: movaps %xmm10, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] ; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; SSE-NEXT: movaps %xmm10, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 80(%rdx), %xmm0 ; SSE-NEXT: movaps 80(%rcx), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE-NEXT: movaps 80(%rdi), %xmm5 -; SSE-NEXT: movaps 80(%rsi), %xmm6 -; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movaps 80(%rsi), %xmm7 +; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm6[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 96(%rdx), %xmm1 -; SSE-NEXT: movaps 96(%rcx), %xmm6 +; SSE-NEXT: movaps 96(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movaps 96(%rdi), %xmm3 ; SSE-NEXT: movaps 96(%rsi), %xmm7 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE-NEXT: movaps %xmm6, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] ; SSE-NEXT: movaps %xmm3, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] @@ -834,8 +834,8 @@ ; SSE-NEXT: movaps 112(%rsi), %xmm8 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] @@ -845,15 +845,15 @@ ; SSE-NEXT: movaps %xmm0, 496(%r8) ; SSE-NEXT: movaps %xmm7, 480(%r8) ; SSE-NEXT: movaps 
%xmm1, 464(%r8) -; SSE-NEXT: movaps %xmm6, 448(%r8) +; SSE-NEXT: movaps %xmm4, 448(%r8) ; SSE-NEXT: movaps %xmm3, 432(%r8) ; SSE-NEXT: movaps %xmm11, 416(%r8) -; SSE-NEXT: movaps %xmm4, 400(%r8) +; SSE-NEXT: movaps %xmm6, 400(%r8) ; SSE-NEXT: movaps %xmm13, 384(%r8) ; SSE-NEXT: movaps %xmm5, 368(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%r8) -; SSE-NEXT: movaps %xmm14, 336(%r8) +; SSE-NEXT: movaps %xmm15, 336(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%r8) ; SSE-NEXT: movaps %xmm10, 304(%r8) @@ -866,7 +866,7 @@ ; SSE-NEXT: movaps %xmm0, 240(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r8) -; SSE-NEXT: movaps %xmm15, 208(%r8) +; SSE-NEXT: movaps %xmm14, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -923,12 +923,12 @@ ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX1-NEXT: vmovaps 64(%rcx), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -942,10 +942,10 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovaps 80(%rcx), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 80(%rdx), %xmm7 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm7[0] +; AVX1-NEXT: vmovaps 80(%rdx), %xmm12 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0] ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -956,41 +956,41 @@ ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps 32(%rcx), %xmm13 -; AVX1-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm10[0] +; AVX1-NEXT: vmovaps 32(%rcx), %xmm10 +; AVX1-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm8[0] ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-NEXT: vmovaps 48(%rsi), %xmm1 ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps 48(%rcx), %xmm11 -; AVX1-NEXT: vmovaps 48(%rdx), %xmm9 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm9[0] +; AVX1-NEXT: vmovaps 48(%rcx), %xmm9 +; AVX1-NEXT: vmovaps 48(%rdx), %xmm7 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm7[0] ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 96(%rdi), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 96(%rsi), %xmm1 -; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps 96(%rcx), %xmm12 +; AVX1-NEXT: vmovaps 96(%rcx), %xmm6 ; AVX1-NEXT: vmovaps 96(%rdx), %xmm5 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm5[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm5[0] ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1005,120 +1005,118 @@ ; AVX1-NEXT: vmovaps 112(%rdx), %xmm3 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps (%rsi), %xmm1 +; AVX1-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero -; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-NEXT: vmovaps (%rsi), %xmm13 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm13[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps (%rcx), %xmm8 +; AVX1-NEXT: vmovaps (%rcx), %xmm11 ; AVX1-NEXT: 
vmovaps (%rdx), %xmm2 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm8[0],xmm2[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm11[0],xmm2[0] ; AVX1-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,0] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] ; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX1-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7] +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm6[2] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm1 -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm0[3,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm14, %ymm6 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm7[2],xmm1[2] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,0],xmm7[3,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm10[2],xmm13[2] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm12[2],xmm1[2] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, 
%ymm7, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm9[2],xmm11[2] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm14 = xmm1[3,0],xmm14[3,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[2,0,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm8[2],xmm10[2] +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,0],xmm1[3,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm12[2] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm10 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,0],xmm1[3,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[2,0,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm7[2],xmm9[2] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 ; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm9 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],xmm1[3,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[2,0,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm9 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm6[2] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm9 = xmm1[3,0],xmm9[3,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,0,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} 
xmm5 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,0],xmm6[3,0] ; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm11[2] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[3,0],xmm4[3,0] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm13[3,0],xmm4[3,0] ; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm8[2] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,0],xmm3[3,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm6[3,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm4[2],xmm6[2] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm9[3,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = 
ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX1-NEXT: vmovaps %ymm2, 96(%r8) ; AVX1-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-NEXT: vmovaps %ymm0, 480(%r8) +; AVX1-NEXT: vmovaps %ymm3, 480(%r8) ; AVX1-NEXT: vmovaps %ymm5, 416(%r8) -; AVX1-NEXT: vmovaps %ymm9, 224(%r8) -; AVX1-NEXT: vmovaps %ymm10, 160(%r8) -; AVX1-NEXT: vmovaps %ymm14, 352(%r8) +; AVX1-NEXT: vmovaps %ymm7, 224(%r8) +; AVX1-NEXT: vmovaps %ymm8, 160(%r8) +; AVX1-NEXT: vmovaps %ymm0, 352(%r8) +; AVX1-NEXT: vmovaps %ymm15, 288(%r8) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm0, 288(%r8) -; AVX1-NEXT: vmovaps %ymm15, (%r8) +; AVX1-NEXT: vmovaps %ymm0, (%r8) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 448(%r8) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1146,133 +1144,133 @@ ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 64(%rdx), %ymm6 ; AVX2-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-NEXT: vmovaps (%rcx), %xmm13 -; AVX2-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX2-NEXT: vmovaps 64(%rcx), %xmm11 -; AVX2-NEXT: vmovaps (%rdx), %xmm14 -; AVX2-NEXT: vmovaps 32(%rdx), %xmm12 -; AVX2-NEXT: vmovaps 64(%rdx), %xmm3 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-NEXT: vmovaps 64(%rsi), %xmm7 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX2-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-NEXT: vmovaps 32(%rcx), %xmm7 +; AVX2-NEXT: vmovaps 64(%rcx), %xmm4 +; AVX2-NEXT: vmovaps (%rdx), %xmm11 +; AVX2-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-NEXT: vmovaps 64(%rdx), %xmm5 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; AVX2-NEXT: vmovaps 32(%rsi), %xmm10 +; AVX2-NEXT: vmovaps 64(%rsi), %xmm12 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm13 +; AVX2-NEXT: vmovaps 64(%rdi), %xmm14 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps 
{{.*#+}} xmm3 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovaps 96(%rcx), %xmm10 -; AVX2-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,0,2,1] -; AVX2-NEXT: vmovaps 96(%rsi), %xmm4 -; AVX2-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; AVX2-NEXT: vmovaps 96(%rdx), %xmm12 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-NEXT: vmovaps 96(%rsi), %xmm13 +; AVX2-NEXT: vmovaps 96(%rdi), %xmm14 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps (%rsi), %xmm15 +; AVX2-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rsi), %xmm1 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX2-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-NEXT: vmovaps 64(%rcx), %ymm0 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm13 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX2-NEXT: vmovaps (%rcx), %ymm13 -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX2-NEXT: vmovaps %ymm2, %ymm5 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[4],ymm13[4],ymm2[5],ymm13[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm9[2,3],ymm0[4,5],ymm9[6,7] +; AVX2-NEXT: vmovaps %ymm2, %ymm8 +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[4],ymm13[4],ymm2[5],ymm13[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovaps 64(%rdi), %ymm10 -; AVX2-NEXT: vmovaps 64(%rsi), %ymm14 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[4],ymm1[4],ymm6[5],ymm1[5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-NEXT: vmovaps 32(%rcx), %ymm9 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX2-NEXT: vmovaps 32(%rdi), %ymm14 +; AVX2-NEXT: 
vmovaps 64(%rdi), %ymm15 +; AVX2-NEXT: vmovaps 64(%rsi), %ymm14 +; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[4],ymm14[4],ymm15[5],ymm14[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[6],ymm1[6],ymm6[7],ymm1[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX2-NEXT: vmovaps 32(%rdx), %ymm3 +; AVX2-NEXT: vmovaps 32(%rcx), %ymm6 +; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX2-NEXT: vmovaps 32(%rdi), %ymm15 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm0 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] -; AVX2-NEXT: vmovaps 96(%rdx), %ymm6 -; AVX2-NEXT: vmovaps 96(%rcx), %ymm9 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-NEXT: vmovaps 96(%rdi), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX2-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-NEXT: vmovaps 96(%rcx), %ymm6 +; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX2-NEXT: vmovaps 96(%rdi), %ymm15 ; AVX2-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm8 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0,1],ymm14[2,3],ymm7[4,5],ymm14[6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm13[2],ymm8[3],ymm13[3],ymm8[6],ymm13[6],ymm8[7],ymm13[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] +; AVX2-NEXT: vmovaps %ymm3, 96(%r8) ; AVX2-NEXT: vmovaps %ymm0, 480(%r8) -; AVX2-NEXT: vmovaps %ymm8, 448(%r8) +; AVX2-NEXT: vmovaps %ymm7, 448(%r8) ; AVX2-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-NEXT: vmovaps %ymm10, 192(%r8) -; AVX2-NEXT: vmovaps %ymm4, 352(%r8) -; AVX2-NEXT: vmovaps %ymm7, 320(%r8) -; AVX2-NEXT: vmovaps %ymm12, 64(%r8) -; AVX2-NEXT: vmovaps %ymm15, (%r8) +; AVX2-NEXT: vmovaps %ymm4, 192(%r8) +; AVX2-NEXT: vmovaps %ymm9, 352(%r8) +; AVX2-NEXT: vmovaps %ymm10, 320(%r8) +; AVX2-NEXT: vmovaps %ymm11, 64(%r8) +; AVX2-NEXT: vmovaps %ymm12, (%r8) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -176,36 +176,36 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm8 +; SSE-NEXT: movaps (%rsi), %xmm2 ; SSE-NEXT: movaps (%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm9 -; SSE-NEXT: movaps (%r8), %xmm2 +; SSE-NEXT: movaps (%rcx), %xmm4 +; SSE-NEXT: movaps (%r8), %xmm5 ; SSE-NEXT: movaps (%r9), %xmm6 ; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[1,1] -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm4[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,3] 
-; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm6[1,1] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm6[3,3] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm7[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[2,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm7[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm9[0,2] ; SSE-NEXT: movaps %xmm6, 16(%rax) ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps %xmm5, 64(%rax) +; SSE-NEXT: movaps %xmm8, 64(%rax) ; SSE-NEXT: movaps %xmm3, 32(%rax) -; SSE-NEXT: movaps %xmm2, (%rax) +; SSE-NEXT: movaps %xmm5, (%rax) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i32_stride6_vf4: @@ -218,22 +218,22 @@ ; AVX1-NEXT: vmovaps (%r8), %xmm4 ; AVX1-NEXT: vmovaps (%r9), %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm13 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm7 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm9 ; AVX1-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm11 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] ; AVX1-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] -; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm3[0,0],xmm1[0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7] +; AVX1-NEXT: vshufps {{.*#+}} xmm13 = xmm3[0,0],xmm1[0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] ; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm10 -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm13[1,2],ymm10[1,2],ymm13[5,6],ymm10[5,6] -; AVX1-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] -; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5],ymm10[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm11 +; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm11[1,2],ymm7[5,6],ymm11[5,6] +; AVX1-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] @@ -245,7 +245,7 @@ ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] ; AVX1-NEXT: vmovaps 
%ymm0, 64(%rax) ; AVX1-NEXT: vmovaps %ymm4, 32(%rax) -; AVX1-NEXT: vmovaps %ymm7, (%rax) +; AVX1-NEXT: vmovaps %ymm10, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -335,73 +335,71 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps (%rsi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rsi), %xmm13 -; SSE-NEXT: movaps (%rdx), %xmm4 +; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps (%rsi), %xmm9 +; SSE-NEXT: movaps 16(%rsi), %xmm12 +; SSE-NEXT: movaps (%rdx), %xmm3 ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps (%rcx), %xmm11 -; SSE-NEXT: movaps 16(%rcx), %xmm15 +; SSE-NEXT: movaps 16(%rcx), %xmm13 ; SSE-NEXT: movaps (%r8), %xmm10 -; SSE-NEXT: movaps 16(%r8), %xmm0 +; SSE-NEXT: movaps 16(%r8), %xmm14 ; SSE-NEXT: movaps (%r9), %xmm7 ; SSE-NEXT: movaps 16(%r9), %xmm5 -; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: movaps %xmm14, %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm5[3,3] -; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm6[0,2] -; SSE-NEXT: movaps %xmm8, %xmm14 -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm14[2,3] -; SSE-NEXT: movaps %xmm15, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm3[0,2] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: movaps %xmm10, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,0] -; SSE-NEXT: movaps %xmm10, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm7[1,1] -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm6[0,2] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE-NEXT: movaps %xmm14, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] +; SSE-NEXT: 
shufps {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,0] +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm5[1,1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm15[0,2] +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm7[3,3] +; SSE-NEXT: movaps %xmm3, %xmm12 +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm13[0,2] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[2,3] +; SSE-NEXT: movaps %xmm11, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,0] +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm7[1,1] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm6[0,2] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm15[0,2] ; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movaps %xmm7, 16(%rax) -; SSE-NEXT: movaps %xmm4, 32(%rax) -; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm3, 64(%rax) -; SSE-NEXT: movaps %xmm13, 80(%rax) -; SSE-NEXT: movaps %xmm8, 96(%rax) +; SSE-NEXT: movaps %xmm3, 32(%rax) +; SSE-NEXT: movaps %xmm13, 48(%rax) +; SSE-NEXT: movaps %xmm14, 64(%rax) +; SSE-NEXT: movaps %xmm12, 80(%rax) +; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps %xmm5, 112(%rax) ; SSE-NEXT: movaps %xmm1, 128(%rax) -; SSE-NEXT: movaps %xmm14, 144(%rax) -; SSE-NEXT: movaps %xmm12, 160(%rax) -; SSE-NEXT: movaps %xmm9, 176(%rax) +; SSE-NEXT: movaps %xmm6, 144(%rax) +; SSE-NEXT: movaps %xmm8, 160(%rax) +; SSE-NEXT: movaps %xmm4, 176(%rax) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i32_stride6_vf8: @@ -412,29 +410,29 @@ ; AVX1-NEXT: vmovaps (%rdx), %ymm5 ; AVX1-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-NEXT: vmovaps (%r8), %ymm3 -; AVX1-NEXT: vmovaps (%rcx), %xmm13 -; AVX1-NEXT: vmovaps (%rdx), %xmm14 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,2],xmm13[1,2] +; AVX1-NEXT: vmovaps (%rcx), %xmm1 +; AVX1-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,2],xmm1[1,2] ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm9 -; AVX1-NEXT: vmovaps (%rsi), %xmm0 -; AVX1-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm10 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX1-NEXT: vbroadcastss 4(%r8), %xmm10 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX1-NEXT: vbroadcastss 4(%r9), %ymm10 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm10 = 
ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps (%rsi), %xmm9 +; AVX1-NEXT: vmovaps (%rdi), %xmm10 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] +; AVX1-NEXT: vbroadcastss 4(%r8), %xmm7 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vbroadcastss 4(%r9), %ymm7 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3],ymm0[4,5,6,7] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX1-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm3[4,5],ymm10[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5],ymm7[6,7] ; AVX1-NEXT: vbroadcastss 16(%r9), %ymm12 -; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm12[5],ymm7[6,7] ; AVX1-NEXT: vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,2],ymm6[1,2],ymm5[5,6],ymm6[5,6] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3] @@ -444,281 +442,281 @@ ; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] ; AVX1-NEXT: vbroadcastss 20(%r9), %ymm12 ; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm12[3],ymm8[4,5,6,7] -; AVX1-NEXT: vbroadcastss (%rcx), %xmm1 -; AVX1-NEXT: vbroadcastss (%rdx), %xmm2 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, (%r8), %ymm0, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-NEXT: vbroadcastss (%r9), %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-NEXT: vbroadcastss (%rcx), %xmm12 +; AVX1-NEXT: vbroadcastss (%rdx), %xmm13 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm10 +; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, (%r8), %ymm9, %ymm9 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] +; AVX1-NEXT: vbroadcastss (%r9), %ymm10 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm5 = 
ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm11[2,3],ymm5[2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5],ymm3[6,7] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3] +; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6],ymm5[7] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-NEXT: vmovaps (%r9), %xmm2 +; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[0,2,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] -; AVX1-NEXT: vmovaps (%r9), %xmm3 -; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[0,2,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] -; AVX1-NEXT: vmovaps %ymm2, 64(%rax) -; AVX1-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-NEXT: vmovaps %ymm0, (%rax) +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX1-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-NEXT: vmovaps %ymm9, (%rax) ; AVX1-NEXT: vmovaps %ymm8, 128(%rax) -; AVX1-NEXT: vmovaps %ymm10, 96(%rax) -; AVX1-NEXT: vmovaps %ymm9, 32(%rax) +; AVX1-NEXT: vmovaps %ymm7, 96(%rax) +; AVX1-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride6_vf8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm0 -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm7 
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm6 -; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm1[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm14, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 
(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm12 +; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm13 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm10, %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm12, %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = 
ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm13, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm11, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: store_i32_stride6_vf8: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %ymm12 -; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm8 -; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm14 -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,1,2,2] -; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm15 -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[1,1,2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2],xmm7[3] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5],ymm6[6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm7 -; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastd 4(%r9), %ymm8 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rcx), %xmm6 -; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rdx), %xmm0 -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm7, %ymm1 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastd (%r9), %ymm1 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastd 20(%r9), %ymm5 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3],ymm1[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX2-FAST-ALL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = <6,u,u,u,u,u,7,u> -; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm5, %ymm5 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm5, %ymm5 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] -; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm5, %ymm13, %ymm5 -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-ALL-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm6, %ymm7 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5],ymm7[6,7] -; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm6, %ymm6 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6],ymm6[7] +; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %ymm5 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,1,2,2] +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm9 +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[1,1,2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm12 +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastd 4(%r9), %ymm13 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3],ymm6[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rcx), %xmm13 +; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rdx), %xmm14 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm12, %ymm11 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastd (%r9), %ymm11 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm11 = 
ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastd 20(%r9), %ymm13 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-FAST-ALL-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = <6,u,u,u,u,u,7,u> +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm13, %ymm13 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm13, %ymm13 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3,4,5,6],ymm13[7] +; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-ALL-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm8, %ymm9 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4,5,6],ymm5[7] ; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[4],ymm11[4],ymm9[5],ymm11[5] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastd 16(%r9), %ymm3 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastd 16(%r9), %ymm1 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, 64(%rax) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 128(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm11, 160(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm12, 128(%rax) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm8, 
32(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm6, 32(%rax) ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride6_vf8: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm2, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm1[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3] -; AVX2-FAST-PERLANE-NEXT: 
vpshufd {{.*#+}} ymm6 = ymm13[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm14, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm11 +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = mem[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 
32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -772,107 +770,107 @@ ; SSE-LABEL: store_i32_stride6_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $72, %rsp -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm10 -; SSE-NEXT: movaps (%rsi), %xmm11 -; SSE-NEXT: movaps 16(%rsi), %xmm8 -; SSE-NEXT: movaps (%rdx), %xmm13 -; SSE-NEXT: movaps 16(%rdx), %xmm12 -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps (%r8), %xmm2 -; SSE-NEXT: movaps 16(%r8), %xmm7 -; SSE-NEXT: movaps (%r9), %xmm1 -; SSE-NEXT: movaps 16(%r9), %xmm14 -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1] -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm5[0,2] +; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movaps 16(%rdi), %xmm5 +; SSE-NEXT: movaps (%rsi), %xmm8 +; SSE-NEXT: movaps 16(%rsi), %xmm11 +; SSE-NEXT: movaps (%rdx), %xmm6 +; SSE-NEXT: movaps 16(%rdx), %xmm7 +; SSE-NEXT: movaps (%rcx), %xmm1 +; SSE-NEXT: movaps 16(%rcx), %xmm14 +; SSE-NEXT: movaps (%r8), %xmm9 +; SSE-NEXT: movaps 16(%r8), %xmm15 +; SSE-NEXT: movaps (%r9), %xmm2 +; SSE-NEXT: movaps 16(%r9), %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[2,3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm3[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: movaps %xmm9, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm4[2,3] +; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; 
SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm2[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm9[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm14[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[0,2] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm12[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm14[3,3] -; SSE-NEXT: movaps 32(%rdx), %xmm11 -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdi), %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm0[3,3] +; SSE-NEXT: movaps 32(%rdx), %xmm13 +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm7[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm15[0,2] +; SSE-NEXT: movaps %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: movaps 32(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm10, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movaps 32(%r8), %xmm2 -; SSE-NEXT: movaps 32(%r9), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] +; SSE-NEXT: movaps 32(%r9), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm14[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] ; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm3[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2] ; SSE-NEXT: movaps 48(%rdx), %xmm2 -; SSE-NEXT: movaps 48(%rcx), %xmm8 +; SSE-NEXT: movaps 48(%rcx), %xmm9 ; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps 48(%rsi), %xmm14 +; SSE-NEXT: movaps 48(%rsi), %xmm10 ; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] ; SSE-NEXT: movaps 48(%r8), %xmm3 ; SSE-NEXT: movaps 48(%r9), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm6 @@ -882,29 +880,29 @@ ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: movaps %xmm8, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: unpckhpd 
{{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm2, 368(%rax) -; SSE-NEXT: movaps %xmm1, 352(%rax) +; SSE-NEXT: movaps %xmm10, 352(%rax) ; SSE-NEXT: movaps %xmm0, 336(%rax) ; SSE-NEXT: movaps %xmm4, 320(%rax) ; SSE-NEXT: movaps %xmm6, 304(%rax) ; SSE-NEXT: movaps %xmm5, 288(%rax) -; SSE-NEXT: movaps %xmm11, 272(%rax) -; SSE-NEXT: movaps %xmm9, 256(%rax) -; SSE-NEXT: movaps %xmm10, 240(%rax) -; SSE-NEXT: movaps %xmm12, 224(%rax) -; SSE-NEXT: movaps %xmm15, 208(%rax) -; SSE-NEXT: movaps %xmm13, 192(%rax) +; SSE-NEXT: movaps %xmm13, 272(%rax) +; SSE-NEXT: movaps %xmm8, 256(%rax) +; SSE-NEXT: movaps %xmm12, 240(%rax) +; SSE-NEXT: movaps %xmm15, 224(%rax) +; SSE-NEXT: movaps %xmm11, 208(%rax) +; SSE-NEXT: movaps %xmm14, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -935,7 +933,7 @@ ; AVX1-LABEL: store_i32_stride6_vf16: ; AVX1: # %bb.0: ; AVX1-NEXT: subq $136, %rsp -; AVX1-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-NEXT: vmovaps (%rdi), %ymm6 ; AVX1-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX1-NEXT: vmovaps (%rsi), %ymm5 ; AVX1-NEXT: vmovaps 32(%rsi), %ymm2 @@ -955,23 +953,23 @@ ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 32(%rcx), %xmm11 -; AVX1-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,2],xmm11[1,2] +; AVX1-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,2],xmm11[1,2] ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps 32(%rsi), %xmm1 ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 32(%rdi), %xmm3 ; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-NEXT: vbroadcastss 36(%r8), %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vbroadcastss 36(%r9), %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: vmovaps (%rcx), %ymm9 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] @@ -1010,18 +1008,18 @@ ; AVX1-NEXT: vbroadcastss 52(%r9), %ymm4 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpckhps {{.*#+}} ymm4 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,2],ymm9[1,2],ymm15[5,6],ymm9[5,6] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX1-NEXT: vbroadcastss 20(%r8), %xmm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vbroadcastss 20(%r9), %ymm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX1-NEXT: vbroadcastss 20(%r8), %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vbroadcastss 20(%r9), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX1-NEXT: vbroadcastss (%rcx), %xmm0 -; AVX1-NEXT: vbroadcastss (%rdx), %xmm5 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-NEXT: vbroadcastss (%rdx), %xmm6 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] @@ -1039,58 +1037,58 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] ; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 ; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] ; AVX1-NEXT: vmovaps 32(%r9), %xmm3 -; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[0,2,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm3[0,2,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] ; AVX1-NEXT: vbroadcastss 32(%rcx), %xmm1 -; AVX1-NEXT: vbroadcastss 32(%rdx), %xmm5 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm6 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, 32(%r8), %ymm5, %ymm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] -; AVX1-NEXT: vbroadcastss 32(%r9), %ymm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm9[3,0],ymm15[3,0],ymm9[7,4],ymm15[7,4] -; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] -; AVX1-NEXT: vperm2f128 $51, (%rsp), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX1-NEXT: # ymm5 
= mem[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-NEXT: vmovaps (%r9), %xmm6 -; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5 -; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,1,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm7, %ymm7 -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5],ymm7[6,7] -; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[0,2,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-NEXT: vbroadcastss 32(%rdx), %xmm6 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, 32(%r8), %ymm6, %ymm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX1-NEXT: vbroadcastss 32(%r9), %ymm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm9[3,0],ymm15[3,0],ymm9[7,4],ymm15[7,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm6[2,3] +; AVX1-NEXT: vperm2f128 $51, (%rsp), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-NEXT: # ymm6 = mem[2,3,2,3] +; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5],ymm6[6,7] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3,2,3] +; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6],ymm6[7] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-NEXT: vmovaps (%r9), %xmm7 +; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm14, %ymm6 +; AVX1-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,1,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5],ymm8[6,7] +; AVX1-NEXT: vpermilps {{.*#+}} xmm8 = xmm7[0,2,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-NEXT: vmovaps %ymm4, 160(%rax) +; AVX1-NEXT: vmovaps %ymm6, 64(%rax) +; AVX1-NEXT: vmovaps %ymm5, 160(%rax) ; AVX1-NEXT: vmovaps %ymm1, 192(%rax) ; AVX1-NEXT: vmovaps %ymm3, 256(%rax) ; AVX1-NEXT: vmovaps %ymm2, 352(%rax) ; AVX1-NEXT: vmovaps %ymm0, (%rax) -; AVX1-NEXT: vmovaps %ymm8, 128(%rax) +; AVX1-NEXT: vmovaps %ymm4, 128(%rax) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 320(%rax) 
; AVX1-NEXT: vmovaps %ymm12, 32(%rax) @@ -1108,127 +1106,127 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $200, %rsp ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm12[0],zero,xmm12[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm3 -; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm4 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} xmm4 = xmm4[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm12[0],zero,xmm12[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm3 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm10, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm15 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm15, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm15, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] 
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm4 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm11, %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm2 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm6, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm5, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4],ymm4[5],ymm8[6],ymm4[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm10 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm4[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm8 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm4[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm14[0,1],ymm11[2,3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = 
xmm12[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] @@ -1242,7 +1240,7 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] @@ -1255,7 +1253,7 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] @@ -1267,7 +1265,7 @@ ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 256(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm13, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1287,142 +1285,142 @@ ; AVX2-FAST-ALL-LABEL: store_i32_stride6_vf16: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: subq $184, %rsp -; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm9 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm7 -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,2,2] -; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,3] +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-ALL-NEXT: 
vmovdqa (%rcx), %xmm5 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,2,3] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm3 +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm15[0],zero,xmm15[1],zero +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastd 36(%r9), %ymm4 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,2,2] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastd 36(%r9), %ymm6 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,1,2,2] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2],xmm6[3] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7] +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] ; AVX2-FAST-ALL-NEXT: vpbroadcastd 4(%r9), %ymm4 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rcx), %xmm3 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] +; AVX2-FAST-ALL-NEXT: 
vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rcx), %xmm2 ; AVX2-FAST-ALL-NEXT: vpbroadcastd (%rdx), %xmm4 -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm5, %ymm2 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastd (%r9), %ymm2 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[1,1,2,3,5,5,6,7] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm3, %ymm1 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastd (%r9), %ymm1 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,1,2,2,4,5,6,6] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,1,2,3,5,5,6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] -; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastd 52(%r9), %ymm3 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpbroadcastd 32(%rcx), %xmm1 -; AVX2-FAST-ALL-NEXT: vpbroadcastd 32(%rdx), %xmm3 -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm14 = 
ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastd 52(%r9), %ymm1 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpbroadcastd 32(%rcx), %xmm0 +; AVX2-FAST-ALL-NEXT: vpbroadcastd 32(%rdx), %xmm1 +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm15, %ymm1 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-ALL-NEXT: vpbroadcastd 32(%r9), %ymm1 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm15 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[1,1,2,3,5,5,6,7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,1,2,3,5,5,6,7] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm10 = ymm1[2,1,2,3] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm11 = ymm1[2,1,2,3] ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5],ymm10[6,7] -; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpbroadcastd 20(%r9), %ymm15 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm15[3],ymm10[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-FAST-ALL-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm15[2,3] -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-ALL-NEXT: vpermd %ymm15, %ymm9, %ymm12 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5],ymm12[6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm8, %ymm10 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4,5,6],ymm10[7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm1[4,5],ymm11[6,7] +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} xmm10 = mem[0],zero,mem[1],zero 
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vpbroadcastd 20(%r9), %ymm11 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FAST-ALL-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm14, %ymm7, %ymm9 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5],ymm9[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,2,3,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm6, %ymm11 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6],ymm11[7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] ; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm10[2,2,2,2] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,2,2,2] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] ; AVX2-FAST-ALL-NEXT: vpbroadcastd 48(%r9), %ymm4 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-ALL-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-ALL-NEXT: # xmm4 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX2-FAST-ALL-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-ALL-NEXT: # xmm4 = xmm8[2],mem[2],xmm8[3],mem[3] ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm4, %ymm10, %ymm4 -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-ALL-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm15, %ymm10, %ymm11 +; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-ALL-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm14, %ymm8, %ymm11 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3,4,5],ymm11[6,7] -; AVX2-FAST-ALL-NEXT: vpermd %ymm12, %ymm10, %ymm11 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4,5,6],ymm11[7] -; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[6],ymm0[6],ymm7[7],ymm0[7] -; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FAST-ALL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm11[2,3] -; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm11 -; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm9, %ymm9 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7] -; AVX2-FAST-ALL-NEXT: 
vmovdqa (%r9), %ymm9 -; AVX2-FAST-ALL-NEXT: vpermd %ymm9, %ymm8, %ymm8 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6],ymm8[7] -; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5] +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm8, %ymm10 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3,4,5,6],ymm10[7] +; AVX2-FAST-ALL-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7] +; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FAST-ALL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm10[2,3] +; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm10 +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm7, %ymm7 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3,4,5],ymm7[6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm6, %ymm6 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] +; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] ; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5],ymm0[6,7] ; AVX2-FAST-ALL-NEXT: vpbroadcastd 16(%r9), %ymm3 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -1431,9 +1429,9 @@ ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX2-FAST-ALL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 -; AVX2-FAST-ALL-NEXT: vpermd %ymm11, %ymm10, %ymm5 +; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm8, %ymm5 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5],ymm5[6,7] -; AVX2-FAST-ALL-NEXT: vpermd %ymm9, %ymm10, %ymm5 +; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm8, %ymm5 ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6],ymm5[7] ; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, 64(%rax) @@ -1441,7 +1439,7 @@ ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 160(%rax) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm4, 256(%rax) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, 288(%rax) -; AVX2-FAST-ALL-NEXT: vmovdqa %ymm6, 352(%rax) +; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, 352(%rax) ; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1462,127 +1460,127 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: 
vmovdqa 32(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm12[0],zero,xmm12[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm12[0],zero,xmm12[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm12, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm6, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4],ymm4[5],ymm8[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm4[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm4[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = mem[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] @@ -1596,7 +1594,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] @@ -1609,7 +1607,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] @@ -1621,7 +1619,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll @@ -136,9 +136,9 @@ ; SSE-NEXT: movaps (%rsi), %xmm4 ; SSE-NEXT: movaps 16(%rsi), %xmm5 ; SSE-NEXT: movaps 32(%rsi), %xmm6 -; SSE-NEXT: movaps 48(%rsi), %xmm8 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm4[1] +; SSE-NEXT: movaps 48(%rsi), %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] @@ -147,8 +147,8 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] ; SSE-NEXT: movaps %xmm3, 96(%rdx) ; SSE-NEXT: movaps %xmm6, 112(%rdx) ; SSE-NEXT: movaps %xmm2, 64(%rdx) @@ -156,7 +156,7 @@ ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps %xmm4, 48(%rdx) ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm7, 16(%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i64_stride2_vf8: @@ -235,64 +235,62 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride2_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 96(%rdi), %xmm6 -; SSE-NEXT: movaps 80(%rdi), %xmm8 -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 16(%rdi), %xmm14 -; SSE-NEXT: movaps 32(%rdi), %xmm15 +; SSE-NEXT: movaps 80(%rdi), %xmm4 +; SSE-NEXT: movaps 64(%rdi), %xmm3 +; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm5 -; SSE-NEXT: movaps 96(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rsi), %xmm11 ; SSE-NEXT: movaps 80(%rsi), %xmm12 ; SSE-NEXT: movaps 64(%rsi), %xmm13 -; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rsi), %xmm0 -; SSE-NEXT: movaps 48(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; SSE-NEXT: movaps (%rsi), %xmm9 +; SSE-NEXT: movaps 16(%rsi), %xmm10 +; SSE-NEXT: movaps 32(%rsi), %xmm14 +; SSE-NEXT: movaps 48(%rsi), %xmm15 +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm1[0] -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0] -; 
SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm14[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] +; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0] +; SSE-NEXT: movaps %xmm3, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] +; SSE-NEXT: movaps %xmm4, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm12[0] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; SSE-NEXT: movaps 112(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] -; SSE-NEXT: movaps %xmm4, 224(%rdx) +; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0] +; SSE-NEXT: movaps 112(%rsi), %xmm11 +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] +; SSE-NEXT: movaps %xmm0, 224(%rdx) ; SSE-NEXT: movaps %xmm7, 240(%rdx) ; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm3, 208(%rdx) -; SSE-NEXT: movaps %xmm8, 160(%rdx) +; SSE-NEXT: movaps %xmm12, 208(%rdx) +; SSE-NEXT: movaps %xmm4, 160(%rdx) ; SSE-NEXT: movaps %xmm13, 176(%rdx) -; SSE-NEXT: movaps %xmm9, 128(%rdx) -; SSE-NEXT: movaps %xmm1, 144(%rdx) +; SSE-NEXT: movaps %xmm3, 128(%rdx) +; SSE-NEXT: movaps %xmm15, 144(%rdx) ; SSE-NEXT: movaps %xmm5, 96(%rdx) -; SSE-NEXT: movaps %xmm0, 112(%rdx) -; SSE-NEXT: movaps %xmm15, 64(%rdx) -; SSE-NEXT: movaps %xmm2, 80(%rdx) -; SSE-NEXT: movaps %xmm14, 32(%rdx) -; SSE-NEXT: movaps %xmm10, 48(%rdx) -; SSE-NEXT: movaps %xmm11, (%rdx) +; SSE-NEXT: movaps %xmm14, 112(%rdx) +; SSE-NEXT: movaps %xmm2, 64(%rdx) +; SSE-NEXT: movaps %xmm10, 80(%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm9, 48(%rdx) +; SSE-NEXT: movaps %xmm8, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: retq @@ -300,25 +298,25 @@ ; AVX1-LABEL: store_i64_stride2_vf16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovaps (%rsi), %xmm0 -; AVX1-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX1-NEXT: vmovaps 32(%rsi), %xmm1 ; AVX1-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX1-NEXT: vmovaps 96(%rsi), %xmm3 ; AVX1-NEXT: vmovaps (%rdi), %xmm4 ; AVX1-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX1-NEXT: vmovaps 64(%rdi), %xmm6 ; AVX1-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm3[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm3[1] ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm2[1] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm2[1] ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm0[1] +; AVX1-NEXT: 
vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm0[1] ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm8[1] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm8[0] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm1[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3] ; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[3],ymm4[3] @@ -335,10 +333,10 @@ ; AVX1-NEXT: vmovapd %ymm6, 96(%rdx) ; AVX1-NEXT: vmovapd %ymm5, 32(%rdx) ; AVX1-NEXT: vmovapd %ymm4, 160(%rdx) -; AVX1-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX1-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX1-NEXT: vmovapd %ymm0, (%rdx) ; AVX1-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX1-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX1-NEXT: vmovaps %ymm3, 192(%rdx) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -423,53 +421,53 @@ ; SSE-NEXT: subq $152, %rsp ; SSE-NEXT: movaps 112(%rdi), %xmm14 ; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps 80(%rdi), %xmm11 -; SSE-NEXT: movaps 64(%rdi), %xmm10 -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm9 -; SSE-NEXT: movaps 48(%rdi), %xmm12 -; SSE-NEXT: movaps 96(%rsi), %xmm15 +; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps 64(%rdi), %xmm9 +; SSE-NEXT: movaps (%rdi), %xmm6 +; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movaps 32(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm11 +; SSE-NEXT: movaps 96(%rsi), %xmm0 ; SSE-NEXT: movaps 80(%rsi), %xmm1 ; SSE-NEXT: movaps 64(%rsi), %xmm2 ; SSE-NEXT: movaps (%rsi), %xmm3 ; SSE-NEXT: movaps 16(%rsi), %xmm4 ; SSE-NEXT: movaps 32(%rsi), %xmm5 -; SSE-NEXT: movaps 48(%rsi), %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] +; SSE-NEXT: movaps 48(%rsi), %xmm12 +; SSE-NEXT: movaps %xmm6, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm4[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm5[1] +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = 
xmm4[0],xmm12[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm15[1] +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm14, %xmm1 @@ -577,48 +575,48 @@ ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX1-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1] ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps (%rsi), %xmm2 ; AVX1-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-NEXT: vmovaps 64(%rsi), %xmm4 -; AVX1-NEXT: vmovaps 96(%rsi), %xmm10 +; AVX1-NEXT: vmovaps 96(%rsi), %xmm5 ; AVX1-NEXT: vmovaps (%rdi), %xmm6 ; AVX1-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX1-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm2[1] +; AVX1-NEXT: vmovaps 64(%rdi), %xmm8 +; AVX1-NEXT: vmovaps 96(%rdi), %xmm9 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm6[1],xmm2[1] ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm3[1] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm3[1] ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, 
%ymm3 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm4[1] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm4 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm10[1] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm5 -; AVX1-NEXT: vmovaps 160(%rsi), %xmm0 -; AVX1-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm1[1],xmm0[1] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-NEXT: vmovapd 192(%rsi), %xmm0 -; AVX1-NEXT: vmovapd 192(%rdi), %xmm1 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm1[1],xmm0[1] -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm4[0] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm5[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-NEXT: vmovaps 160(%rsi), %xmm6 +; AVX1-NEXT: vmovaps 160(%rdi), %xmm7 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-NEXT: vmovaps 192(%rsi), %xmm7 +; AVX1-NEXT: vmovaps 192(%rdi), %xmm8 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[3],ymm8[3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3] -; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[3],ymm1[3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[3],ymm9[3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3] ; AVX1-NEXT: vshufpd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[3],ymm10[3] @@ -635,23 +633,23 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3] ; AVX1-NEXT: vshufpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[3],ymm14[3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] -; AVX1-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm15[0],ymm8[3],ymm15[3] -; AVX1-NEXT: vmovapd %ymm8, 480(%rdx) +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[3],ymm15[3] +; AVX1-NEXT: vmovapd %ymm0, 480(%rdx) ; AVX1-NEXT: vmovapd %ymm14, 416(%rdx) ; AVX1-NEXT: vmovapd %ymm13, 352(%rdx) ; AVX1-NEXT: vmovapd %ymm12, 224(%rdx) ; AVX1-NEXT: vmovapd %ymm11, 160(%rdx) ; AVX1-NEXT: vmovapd %ymm10, 96(%rdx) -; AVX1-NEXT: vmovapd %ymm1, 32(%rdx) -; AVX1-NEXT: vmovapd %ymm0, 288(%rdx) +; AVX1-NEXT: vmovapd %ymm9, 32(%rdx) +; AVX1-NEXT: vmovapd %ymm8, 288(%rdx) ; AVX1-NEXT: vmovaps %ymm7, 384(%rdx) ; AVX1-NEXT: vmovaps %ymm6, 320(%rdx) ; AVX1-NEXT: vmovaps %ymm5, 192(%rdx) ; AVX1-NEXT: vmovaps %ymm4, 128(%rdx) ; AVX1-NEXT: 
vmovaps %ymm3, 64(%rdx) ; AVX1-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-NEXT: vmovaps %ymm9, 256(%rdx) +; AVX1-NEXT: vmovaps %ymm1, 256(%rdx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 448(%rdx) ; AVX1-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll @@ -181,44 +181,44 @@ ; SSE: # %bb.0: ; SSE-NEXT: movaps (%rdi), %xmm3 ; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm13 -; SSE-NEXT: movaps 48(%rdi), %xmm12 -; SSE-NEXT: movaps (%rsi), %xmm8 -; SSE-NEXT: movaps 16(%rsi), %xmm9 -; SSE-NEXT: movaps 32(%rsi), %xmm11 -; SSE-NEXT: movaps 48(%rsi), %xmm4 -; SSE-NEXT: movaps (%rdx), %xmm7 -; SSE-NEXT: movaps 16(%rdx), %xmm0 +; SSE-NEXT: movaps 32(%rdi), %xmm1 +; SSE-NEXT: movaps 48(%rdi), %xmm0 +; SSE-NEXT: movaps (%rsi), %xmm7 +; SSE-NEXT: movaps 16(%rsi), %xmm8 +; SSE-NEXT: movaps 32(%rsi), %xmm9 +; SSE-NEXT: movaps 48(%rsi), %xmm10 +; SSE-NEXT: movaps (%rdx), %xmm11 +; SSE-NEXT: movaps 16(%rdx), %xmm12 ; SSE-NEXT: movaps 32(%rdx), %xmm6 ; SSE-NEXT: movaps 48(%rdx), %xmm5 -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm4[0] -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm8[0] +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm8[0] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm3[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] ; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps %xmm7, 16(%rcx) -; SSE-NEXT: movaps %xmm4, 32(%rcx) +; SSE-NEXT: movaps %xmm11, 16(%rcx) +; SSE-NEXT: movaps %xmm8, 32(%rcx) ; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps %xmm0, 64(%rcx) -; SSE-NEXT: movaps %xmm1, 80(%rcx) -; SSE-NEXT: movaps %xmm13, 96(%rcx) +; SSE-NEXT: movaps %xmm12, 64(%rcx) +; SSE-NEXT: movaps %xmm9, 80(%rcx) +; SSE-NEXT: movaps %xmm1, 96(%rcx) ; SSE-NEXT: movaps %xmm6, 112(%rcx) -; SSE-NEXT: movaps %xmm14, 128(%rcx) -; SSE-NEXT: movaps %xmm12, 144(%rcx) +; SSE-NEXT: movaps %xmm10, 128(%rcx) +; SSE-NEXT: movaps %xmm0, 144(%rcx) ; SSE-NEXT: movaps %xmm5, 160(%rcx) -; 
SSE-NEXT: movaps %xmm10, 176(%rcx) +; SSE-NEXT: movaps %xmm4, 176(%rcx) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i64_stride3_vf8: @@ -343,98 +343,99 @@ ; SSE-LABEL: store_i64_stride3_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movapd 64(%rdi), %xmm9 -; SSE-NEXT: movapd (%rdi), %xmm3 -; SSE-NEXT: movapd 16(%rdi), %xmm13 -; SSE-NEXT: movapd 32(%rdi), %xmm8 -; SSE-NEXT: movapd 48(%rdi), %xmm10 -; SSE-NEXT: movapd 64(%rsi), %xmm12 -; SSE-NEXT: movapd (%rsi), %xmm7 -; SSE-NEXT: movapd 16(%rsi), %xmm14 -; SSE-NEXT: movapd 32(%rsi), %xmm15 -; SSE-NEXT: movapd 48(%rsi), %xmm11 -; SSE-NEXT: movapd 64(%rdx), %xmm6 -; SSE-NEXT: movapd (%rdx), %xmm2 -; SSE-NEXT: movapd 16(%rdx), %xmm4 -; SSE-NEXT: movapd 32(%rdx), %xmm5 -; SSE-NEXT: movapd 48(%rdx), %xmm0 -; SSE-NEXT: movapd %xmm3, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm7[0] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: movapd 64(%rdi), %xmm4 +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: movapd 16(%rdi), %xmm1 +; SSE-NEXT: movapd 32(%rdi), %xmm2 +; SSE-NEXT: movapd 48(%rdi), %xmm5 +; SSE-NEXT: movapd 64(%rsi), %xmm9 +; SSE-NEXT: movapd (%rsi), %xmm3 +; SSE-NEXT: movapd 16(%rsi), %xmm6 +; SSE-NEXT: movapd 32(%rsi), %xmm7 +; SSE-NEXT: movapd 48(%rsi), %xmm10 +; SSE-NEXT: movapd 64(%rdx), %xmm15 +; SSE-NEXT: movapd (%rdx), %xmm11 +; SSE-NEXT: movapd 16(%rdx), %xmm12 +; SSE-NEXT: movapd 32(%rdx), %xmm13 +; SSE-NEXT: movapd 48(%rdx), %xmm14 +; SSE-NEXT: movapd %xmm0, %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm3[0] +; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm14[0] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm4[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm4[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm8, %xmm13 -; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm15[0] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm5[1] -; SSE-NEXT: movapd %xmm10, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm11[0] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = 
xmm11[1],xmm0[1] -; SSE-NEXT: movapd %xmm9, %xmm14 -; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm12[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] -; SSE-NEXT: movapd 80(%rdi), %xmm8 +; SSE-NEXT: movapd %xmm5, %xmm13 +; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm10[0] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm14[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm14[1] +; SSE-NEXT: movapd %xmm4, %xmm14 +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm9[0] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm15[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] +; SSE-NEXT: movapd 80(%rdi), %xmm15 ; SSE-NEXT: movapd 80(%rsi), %xmm6 -; SSE-NEXT: movapd %xmm8, %xmm9 -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0] +; SSE-NEXT: movapd %xmm15, %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm6[0] ; SSE-NEXT: movapd 80(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movapd 96(%rdi), %xmm5 +; SSE-NEXT: movapd 96(%rdi), %xmm4 ; SSE-NEXT: movapd 96(%rsi), %xmm1 -; SSE-NEXT: movapd %xmm5, %xmm7 +; SSE-NEXT: movapd %xmm4, %xmm7 ; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: movapd 96(%rdx), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movapd 112(%rdi), %xmm2 ; SSE-NEXT: movapd 112(%rsi), %xmm0 ; SSE-NEXT: movapd %xmm2, %xmm3 ; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movapd 112(%rdx), %xmm4 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movapd 112(%rdx), %xmm5 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movapd %xmm0, 368(%rcx) ; SSE-NEXT: movapd %xmm2, 352(%rcx) ; SSE-NEXT: movapd %xmm3, 336(%rcx) ; SSE-NEXT: movapd %xmm1, 320(%rcx) -; SSE-NEXT: movapd %xmm5, 304(%rcx) +; SSE-NEXT: movapd %xmm4, 304(%rcx) ; SSE-NEXT: movapd %xmm7, 288(%rcx) ; SSE-NEXT: movapd %xmm6, 272(%rcx) -; SSE-NEXT: movapd %xmm8, 256(%rcx) -; SSE-NEXT: movapd %xmm9, 240(%rcx) -; SSE-NEXT: movapd %xmm12, 224(%rcx) +; SSE-NEXT: movapd %xmm15, 256(%rcx) +; SSE-NEXT: movapd %xmm8, 240(%rcx) +; SSE-NEXT: movapd %xmm9, 224(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rcx) ; SSE-NEXT: movapd %xmm14, 192(%rcx) -; SSE-NEXT: movapd %xmm11, 176(%rcx) +; SSE-NEXT: movapd %xmm10, 176(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rcx) -; SSE-NEXT: movapd %xmm10, 144(%rcx) -; SSE-NEXT: movapd %xmm15, 128(%rcx) +; SSE-NEXT: movapd %xmm13, 144(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) -; SSE-NEXT: movapd %xmm13, 96(%rcx) +; SSE-NEXT: movapd %xmm12, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; 
SSE-NEXT: movaps %xmm0, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movapd %xmm11, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll @@ -88,31 +88,31 @@ ; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm8 +; SSE-NEXT: movaps 16(%rsi), %xmm3 ; SSE-NEXT: movaps (%rdx), %xmm4 ; SSE-NEXT: movaps 16(%rdx), %xmm5 ; SSE-NEXT: movaps (%rcx), %xmm6 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: movaps 16(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] ; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: movaps %xmm1, 96(%r8) ; SSE-NEXT: movaps %xmm5, 112(%r8) ; SSE-NEXT: movaps %xmm6, 64(%r8) ; SSE-NEXT: movaps %xmm2, 80(%r8) ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps %xmm4, 48(%r8) -; SSE-NEXT: movaps %xmm7, (%r8) -; SSE-NEXT: movaps %xmm3, 16(%r8) +; SSE-NEXT: movaps %xmm9, (%r8) +; SSE-NEXT: movaps %xmm8, 16(%r8) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i64_stride4_vf4: @@ -193,65 +193,62 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride4_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm13 -; SSE-NEXT: movaps 32(%rdi), %xmm9 +; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm6 +; SSE-NEXT: movaps (%rsi), %xmm10 ; SSE-NEXT: movaps 16(%rsi), %xmm12 -; SSE-NEXT: movaps 32(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rdx), %xmm14 +; SSE-NEXT: movaps 32(%rsi), %xmm11 +; SSE-NEXT: movaps (%rdx), %xmm2 ; SSE-NEXT: movaps 16(%rdx), %xmm4 ; SSE-NEXT: movaps 32(%rdx), %xmm7 -; SSE-NEXT: movaps 48(%rdx), %xmm5 -; SSE-NEXT: movaps (%rcx), %xmm1 -; SSE-NEXT: movaps 16(%rcx), %xmm2 -; SSE-NEXT: movaps 32(%rcx), %xmm15 -; SSE-NEXT: movaps 48(%rcx), %xmm11 -; SSE-NEXT: movaps %xmm14, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = 
xmm14[1],xmm1[1] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] +; SSE-NEXT: movaps 48(%rdx), %xmm9 +; SSE-NEXT: movaps (%rcx), %xmm8 +; SSE-NEXT: movaps 16(%rcx), %xmm13 +; SSE-NEXT: movaps 32(%rcx), %xmm14 +; SSE-NEXT: movaps 48(%rcx), %xmm15 +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] ; SSE-NEXT: movaps %xmm7, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] -; SSE-NEXT: movaps %xmm9, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1] -; SSE-NEXT: movaps 48(%rsi), %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] +; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] +; SSE-NEXT: movaps 48(%rsi), %xmm15 ; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; SSE-NEXT: movaps %xmm0, 224(%r8) -; SSE-NEXT: movaps %xmm5, 240(%r8) +; SSE-NEXT: movaps %xmm9, 240(%r8) ; SSE-NEXT: movaps %xmm6, 192(%r8) -; SSE-NEXT: movaps %xmm2, 208(%r8) -; SSE-NEXT: movaps %xmm9, 160(%r8) +; SSE-NEXT: movaps %xmm11, 208(%r8) +; SSE-NEXT: movaps %xmm3, 160(%r8) ; SSE-NEXT: movaps %xmm7, 176(%r8) -; SSE-NEXT: movaps %xmm15, 128(%r8) +; SSE-NEXT: movaps %xmm14, 128(%r8) ; SSE-NEXT: movaps %xmm12, 144(%r8) -; SSE-NEXT: movaps %xmm13, 96(%r8) +; SSE-NEXT: movaps %xmm1, 96(%r8) ; SSE-NEXT: movaps %xmm4, 112(%r8) -; SSE-NEXT: movaps %xmm1, 64(%r8) -; SSE-NEXT: movaps %xmm8, 80(%r8) -; SSE-NEXT: movaps %xmm3, 32(%r8) -; SSE-NEXT: movaps %xmm14, 48(%r8) -; SSE-NEXT: movaps %xmm10, (%r8) +; SSE-NEXT: movaps %xmm13, 64(%r8) +; SSE-NEXT: movaps %xmm10, 80(%r8) +; SSE-NEXT: movaps %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm2, 48(%r8) +; SSE-NEXT: movaps %xmm8, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: retq @@ -266,46 +263,46 @@ ; 
AVX1-NEXT: vmovaps 48(%rsi), %xmm5 ; AVX1-NEXT: vmovaps 48(%rdi), %xmm6 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm5[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX1-NEXT: vmovaps 16(%rsi), %xmm5 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm6 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm5[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm5[0],xmm3[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm3[0] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1] ; AVX1-NEXT: vmovaps (%rsi), %xmm5 ; AVX1-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm5[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm5[0] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1] ; AVX1-NEXT: vmovaps 32(%rcx), %xmm7 -; AVX1-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm7[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; AVX1-NEXT: vmovaps (%rcx), %xmm7 -; AVX1-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm7[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; AVX1-NEXT: vmovaps %xmm2, 48(%r8) -; AVX1-NEXT: vmovaps %xmm6, 16(%r8) -; AVX1-NEXT: vmovaps %xmm1, 176(%r8) -; AVX1-NEXT: vmovaps %xmm4, 144(%r8) +; AVX1-NEXT: vmovaps 32(%rdx), %xmm9 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm7[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm7[1] +; AVX1-NEXT: vmovaps (%rcx), %xmm9 +; AVX1-NEXT: vmovaps (%rdx), %xmm11 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm9[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm9[1] +; AVX1-NEXT: vmovaps %xmm9, 48(%r8) +; AVX1-NEXT: vmovaps %xmm12, 16(%r8) +; AVX1-NEXT: vmovaps %xmm7, 176(%r8) +; AVX1-NEXT: vmovaps %xmm10, 144(%r8) ; AVX1-NEXT: vmovaps %xmm5, 32(%r8) -; AVX1-NEXT: vmovaps %xmm0, (%r8) +; AVX1-NEXT: vmovaps %xmm8, (%r8) ; AVX1-NEXT: vmovaps %xmm3, 160(%r8) -; AVX1-NEXT: vmovaps %xmm12, 128(%r8) -; AVX1-NEXT: vmovaps %ymm11, 96(%r8) -; AVX1-NEXT: vmovaps %ymm10, 64(%r8) -; AVX1-NEXT: vmovaps %ymm9, 224(%r8) -; AVX1-NEXT: vmovaps %ymm8, 192(%r8) +; AVX1-NEXT: vmovaps %xmm6, 128(%r8) +; AVX1-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-NEXT: vmovaps %ymm4, 64(%r8) +; AVX1-NEXT: vmovaps %ymm1, 224(%r8) +; AVX1-NEXT: vmovaps %ymm0, 192(%r8) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -321,44 +318,44 @@ ; AVX2-NEXT: vmovaps 32(%rcx), %ymm8 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = 
ymm6[1],ymm8[1],ymm6[3],ymm8[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm6[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm6[2,3],ymm4[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3],ymm5[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm5[2,3] ; AVX2-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-NEXT: vmovaps 32(%rsi), %xmm5 ; AVX2-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm12 = xmm7[0],xmm5[0] -; AVX2-NEXT: vmovaps (%rcx), %xmm1 -; AVX2-NEXT: vmovaps 32(%rcx), %xmm4 -; AVX2-NEXT: vmovaps (%rdx), %xmm2 -; AVX2-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm4[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm5[0] +; AVX2-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-NEXT: vmovaps 32(%rcx), %xmm10 +; AVX2-NEXT: vmovaps (%rdx), %xmm11 +; AVX2-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm10[0] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm6[0],xmm3[0] -; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm2[0],xmm1[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm12[1],xmm10[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm6[0],xmm3[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm9[0] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX2-NEXT: vmovaps %xmm1, 48(%r8) +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm11[1],xmm9[1] +; AVX2-NEXT: vmovaps %xmm6, 48(%r8) ; AVX2-NEXT: vmovaps %xmm3, 32(%r8) -; AVX2-NEXT: vmovaps %xmm7, 16(%r8) -; AVX2-NEXT: vmovaps %xmm4, (%r8) -; AVX2-NEXT: vmovaps %xmm0, 176(%r8) +; AVX2-NEXT: vmovaps %xmm12, 16(%r8) +; AVX2-NEXT: vmovaps %xmm10, (%r8) +; AVX2-NEXT: vmovaps %xmm7, 176(%r8) ; AVX2-NEXT: vmovaps %xmm5, 160(%r8) ; AVX2-NEXT: vmovaps %xmm13, 144(%r8) -; AVX2-NEXT: vmovaps %xmm12, 128(%r8) -; AVX2-NEXT: vmovaps %ymm11, 96(%r8) -; AVX2-NEXT: vmovaps %ymm10, 64(%r8) -; AVX2-NEXT: vmovaps %ymm8, 224(%r8) -; AVX2-NEXT: vmovaps %ymm9, 192(%r8) +; AVX2-NEXT: vmovaps %xmm8, 128(%r8) +; AVX2-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-NEXT: vmovaps %ymm0, 192(%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -415,62 +412,62 @@ ; SSE-LABEL: store_i64_stride4_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm9 -; SSE-NEXT: movaps 48(%rdi), %xmm10 +; SSE-NEXT: movaps (%rdi), %xmm6 +; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movaps 32(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm9 ; SSE-NEXT: movaps (%rsi), %xmm1 ; SSE-NEXT: movaps 16(%rsi), %xmm2 -; SSE-NEXT: movaps 32(%rsi), %xmm13 -; SSE-NEXT: movaps 48(%rsi), %xmm11 -; SSE-NEXT: movaps (%rdx), %xmm6 -; SSE-NEXT: movaps 16(%rdx), %xmm12 -; SSE-NEXT: movaps 32(%rdx), %xmm14 -; SSE-NEXT: movaps 48(%rdx), %xmm15 +; SSE-NEXT: movaps 32(%rsi), %xmm0 +; SSE-NEXT: movaps 48(%rsi), 
%xmm15 +; SSE-NEXT: movaps (%rdx), %xmm10 +; SSE-NEXT: movaps 16(%rdx), %xmm11 +; SSE-NEXT: movaps 32(%rdx), %xmm13 +; SSE-NEXT: movaps 48(%rdx), %xmm14 ; SSE-NEXT: movaps (%rcx), %xmm3 ; SSE-NEXT: movaps 16(%rcx), %xmm4 ; SSE-NEXT: movaps 32(%rcx), %xmm5 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm13[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm15 ; SSE-NEXT: movaps 64(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm15, %xmm0 @@ -567,87 +564,87 @@ ; AVX1-LABEL: store_i64_stride4_vf16: ; AVX1: # %bb.0: ; AVX1-NEXT: subq $152, %rsp -; AVX1-NEXT: vmovaps 96(%rdx), %ymm12 -; AVX1-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX1-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX1-NEXT: vmovaps (%rdx), %ymm3 -; AVX1-NEXT: vmovaps 96(%rcx), %ymm13 -; AVX1-NEXT: vmovaps 64(%rcx), %ymm5 +; AVX1-NEXT: vmovaps 96(%rdx), %ymm7 +; AVX1-NEXT: vmovaps 64(%rdx), %ymm5 +; AVX1-NEXT: vmovaps 32(%rdx), %ymm3 +; AVX1-NEXT: vmovaps (%rdx), %ymm1 +; AVX1-NEXT: vmovaps 96(%rcx), %ymm8 +; AVX1-NEXT: vmovaps 64(%rcx), %ymm6 ; AVX1-NEXT: vmovaps 32(%rcx), %ymm4 -; AVX1-NEXT: vmovaps (%rcx), %ymm6 -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX1-NEXT: vmovaps 16(%rsi), %xmm0 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm7[1],xmm0[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm0[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: vmovaps (%rcx), %ymm2 +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX1-NEXT: vmovaps 16(%rsi), %xmm9 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm10 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] -; AVX1-NEXT: vmovaps 48(%rsi), %xmm3 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm3[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm10[0],xmm9[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-NEXT: vmovaps 48(%rsi), %xmm9 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm10 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = 
xmm10[0],xmm9[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] -; AVX1-NEXT: vmovaps 80(%rsi), %xmm2 -; AVX1-NEXT: vmovaps 80(%rdi), %xmm3 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm2[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX1-NEXT: vmovaps 80(%rsi), %xmm9 +; AVX1-NEXT: vmovaps 80(%rdi), %xmm10 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm4[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm10[0],xmm9[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-NEXT: vmovaps 112(%rsi), %xmm1 -; AVX1-NEXT: vmovaps 112(%rdi), %xmm2 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-NEXT: vmovaps 112(%rsi), %xmm9 +; AVX1-NEXT: vmovaps 112(%rdi), %xmm10 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm6[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm9[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps 64(%rsi), %xmm2 -; AVX1-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] +; AVX1-NEXT: vmovaps 64(%rsi), %xmm10 +; AVX1-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm10[0] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 32(%rcx), %xmm4 -; AVX1-NEXT: vmovaps 64(%rcx), %xmm5 -; AVX1-NEXT: vmovaps 64(%rdx), %xmm7 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm7[0],xmm5[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm3[1],xmm2[1] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm7[1],xmm5[1] -; AVX1-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX1-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm7[0],xmm5[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm7[1],xmm5[1] -; AVX1-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm11 = xmm7[0],xmm4[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm7[1],xmm4[1] -; AVX1-NEXT: vmovaps 96(%rsi), %xmm7 +; AVX1-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX1-NEXT: vmovaps 64(%rcx), %xmm13 +; AVX1-NEXT: vmovaps 64(%rdx), %xmm14 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm13[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] +; AVX1-NEXT: 
vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] +; AVX1-NEXT: vmovaps 32(%rsi), %xmm13 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX1-NEXT: vmovaps 32(%rdx), %xmm14 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm14[0],xmm12[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX1-NEXT: vmovaps 96(%rsi), %xmm14 ; AVX1-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm7[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm7[1] -; AVX1-NEXT: vmovaps 96(%rcx), %xmm7 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm14[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm14[1] +; AVX1-NEXT: vmovaps 96(%rcx), %xmm14 ; AVX1-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm7[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm7[1] -; AVX1-NEXT: vmovaps (%rsi), %xmm7 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm14[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm14[1] +; AVX1-NEXT: vmovaps (%rsi), %xmm14 ; AVX1-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm7[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm7[1] -; AVX1-NEXT: vmovaps (%rcx), %xmm7 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm14[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm14[1] +; AVX1-NEXT: vmovaps (%rcx), %xmm14 ; AVX1-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm7[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm14[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; AVX1-NEXT: vmovaps %xmm0, 48(%r8) ; AVX1-NEXT: vmovaps %xmm2, 32(%r8) ; AVX1-NEXT: vmovaps %xmm1, 16(%r8) @@ -655,14 +652,14 @@ ; AVX1-NEXT: vmovaps %xmm4, 432(%r8) ; AVX1-NEXT: vmovaps %xmm6, 416(%r8) ; AVX1-NEXT: vmovaps %xmm5, 400(%r8) -; AVX1-NEXT: vmovaps %xmm8, 384(%r8) -; AVX1-NEXT: vmovaps %xmm9, 176(%r8) -; AVX1-NEXT: vmovaps %xmm10, 160(%r8) -; AVX1-NEXT: vmovaps %xmm11, 144(%r8) -; AVX1-NEXT: vmovaps %xmm12, 128(%r8) -; AVX1-NEXT: vmovaps %xmm13, 304(%r8) -; AVX1-NEXT: vmovaps %xmm14, 288(%r8) -; AVX1-NEXT: vmovaps %xmm15, 272(%r8) +; AVX1-NEXT: vmovaps %xmm7, 384(%r8) +; AVX1-NEXT: vmovaps %xmm12, 176(%r8) +; AVX1-NEXT: vmovaps %xmm13, 160(%r8) +; AVX1-NEXT: vmovaps %xmm8, 144(%r8) +; AVX1-NEXT: vmovaps %xmm15, 128(%r8) +; AVX1-NEXT: vmovaps %xmm11, 304(%r8) +; AVX1-NEXT: vmovaps %xmm10, 288(%r8) +; AVX1-NEXT: vmovaps %xmm9, 272(%r8) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 256(%r8) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -688,11 +685,11 @@ ; AVX2-LABEL: store_i64_stride4_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $152, %rsp -; AVX2-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-NEXT: vmovaps 96(%rdi), %ymm7 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-NEXT: vmovaps 96(%rsi), %ymm7 +; AVX2-NEXT: vmovaps 96(%rsi), %ymm8 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm6 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm4 ; AVX2-NEXT: vmovaps (%rsi), %ymm2 @@ -702,88 +699,88 @@ ; AVX2-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-NEXT: vmovaps 96(%rcx), %ymm13 ; AVX2-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm15 = 
ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm8[2,3] -; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 64(%rcx), %ymm15 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] ; AVX2-NEXT: vmovaps 32(%rcx), %ymm14 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm12[2,3] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm12[2,3] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm2[2,3] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm2[2,3] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm11[2,3] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm11[2,3] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm4[2,3] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm4[2,3] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm10[2,3] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm10[2,3] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3],ymm6[2,3] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm6[2,3] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm13[0],ymm9[2],ymm13[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm9[2,3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rsi), %xmm2 -; AVX2-NEXT: vmovaps 64(%rsi), %xmm3 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm3[0] +; AVX2-NEXT: vmovaps 
32(%rsi), %xmm9 +; AVX2-NEXT: vmovaps 64(%rsi), %xmm10 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm11 +; AVX2-NEXT: vmovaps 64(%rdi), %xmm12 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm10[0] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX2-NEXT: vmovaps 64(%rcx), %xmm7 -; AVX2-NEXT: vmovaps 64(%rdx), %xmm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm5[1],xmm3[1] -; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm7[0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm0[1],xmm7[1] -; AVX2-NEXT: vmovlhps {{.*#+}} xmm12 = xmm4[0],xmm2[0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm4[1],xmm2[1] -; AVX2-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm11 = xmm4[0],xmm6[0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm4[1],xmm6[1] -; AVX2-NEXT: vmovaps 96(%rsi), %xmm6 +; AVX2-NEXT: vmovaps 32(%rcx), %xmm13 +; AVX2-NEXT: vmovaps 64(%rcx), %xmm14 +; AVX2-NEXT: vmovaps 64(%rdx), %xmm15 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm12[1],xmm10[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm14[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm14[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm11[0],xmm9[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm9[1] +; AVX2-NEXT: vmovaps 32(%rdx), %xmm11 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm13[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1] +; AVX2-NEXT: vmovaps 96(%rsi), %xmm13 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm6[0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm6[1] -; AVX2-NEXT: vmovaps 96(%rcx), %xmm6 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm13[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm13[1] +; AVX2-NEXT: vmovaps 96(%rcx), %xmm13 ; AVX2-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm6[0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm6[1] -; AVX2-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm13[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm13[1] +; AVX2-NEXT: vmovaps (%rsi), %xmm13 ; AVX2-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm6[0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm6[1] -; AVX2-NEXT: vmovaps (%rcx), %xmm6 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm13[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm13[1] +; AVX2-NEXT: vmovaps (%rcx), %xmm13 ; AVX2-NEXT: vmovaps (%rdx), %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm6[0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm13[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] ; AVX2-NEXT: vmovaps %xmm0, 48(%r8) ; AVX2-NEXT: vmovaps %xmm2, 32(%r8) ; AVX2-NEXT: vmovaps %xmm1, 16(%r8) ; AVX2-NEXT: vmovaps %xmm3, (%r8) ; AVX2-NEXT: vmovaps %xmm4, 432(%r8) -; AVX2-NEXT: vmovaps %xmm7, 416(%r8) +; AVX2-NEXT: vmovaps %xmm6, 416(%r8) ; AVX2-NEXT: vmovaps %xmm5, 400(%r8) -; AVX2-NEXT: vmovaps %xmm8, 384(%r8) -; AVX2-NEXT: vmovaps %xmm9, 176(%r8) -; AVX2-NEXT: vmovaps %xmm10, 160(%r8) -; AVX2-NEXT: vmovaps %xmm11, 144(%r8) -; AVX2-NEXT: vmovaps %xmm12, 128(%r8) -; AVX2-NEXT: vmovaps %xmm13, 304(%r8) -; AVX2-NEXT: vmovaps %xmm14, 288(%r8) -; AVX2-NEXT: vmovaps %xmm15, 272(%r8) +; AVX2-NEXT: vmovaps %xmm7, 384(%r8) +; AVX2-NEXT: vmovaps %xmm11, 176(%r8) +; AVX2-NEXT: vmovaps %xmm9, 160(%r8) +; AVX2-NEXT: vmovaps %xmm8, 144(%r8) +; AVX2-NEXT: vmovaps %xmm15, 128(%r8) +; AVX2-NEXT: vmovaps %xmm14, 304(%r8) +; AVX2-NEXT: 
vmovaps %xmm10, 288(%r8) +; AVX2-NEXT: vmovaps %xmm12, 272(%r8) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm0, 256(%r8) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -13,24 +13,24 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm8 +; SSE-NEXT: movaps (%rsi), %xmm1 ; SSE-NEXT: movaps (%rdx), %xmm2 ; SSE-NEXT: movaps (%rcx), %xmm3 ; SSE-NEXT: movaps (%r8), %xmm4 ; SSE-NEXT: movaps (%r9), %xmm5 ; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: movaps %xmm4, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: movaps %xmm2, 16(%rax) ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps %xmm4, 80(%rax) -; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movaps %xmm8, 64(%rax) ; SSE-NEXT: movaps %xmm7, 32(%rax) ; SSE-NEXT: movaps %xmm6, (%rax) ; SSE-NEXT: retq @@ -122,48 +122,48 @@ ; SSE-LABEL: store_i64_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm15 -; SSE-NEXT: movaps (%rsi), %xmm8 -; SSE-NEXT: movaps 16(%rsi), %xmm10 +; SSE-NEXT: movaps (%rdi), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps (%rsi), %xmm5 +; SSE-NEXT: movaps 16(%rsi), %xmm7 ; SSE-NEXT: movaps (%rdx), %xmm6 ; SSE-NEXT: movaps 16(%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm9 -; SSE-NEXT: movaps 16(%rcx), %xmm5 -; SSE-NEXT: movaps (%r8), %xmm7 +; SSE-NEXT: movaps (%rcx), %xmm8 +; SSE-NEXT: movaps 16(%rcx), %xmm9 +; SSE-NEXT: movaps (%r8), %xmm10 ; SSE-NEXT: movaps 16(%r8), %xmm4 ; SSE-NEXT: movaps (%r9), %xmm11 -; SSE-NEXT: movaps 16(%r9), %xmm3 -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] +; SSE-NEXT: movaps 16(%r9), %xmm12 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] -; SSE-NEXT: movaps %xmm15, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm10[0] -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1] +; SSE-NEXT: 
movaps %xmm0, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] +; SSE-NEXT: movaps %xmm6, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movaps %xmm6, 16(%rax) -; SSE-NEXT: movaps %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm2, 48(%rax) -; SSE-NEXT: movaps %xmm5, 64(%rax) -; SSE-NEXT: movaps %xmm3, 80(%rax) -; SSE-NEXT: movaps %xmm15, 96(%rax) +; SSE-NEXT: movaps %xmm10, 32(%rax) +; SSE-NEXT: movaps %xmm12, 48(%rax) +; SSE-NEXT: movaps %xmm9, 64(%rax) +; SSE-NEXT: movaps %xmm7, 80(%rax) +; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps %xmm1, 112(%rax) ; SSE-NEXT: movaps %xmm4, 128(%rax) ; SSE-NEXT: movaps %xmm14, 144(%rax) ; SSE-NEXT: movaps %xmm13, 160(%rax) -; SSE-NEXT: movaps %xmm12, 176(%rax) +; SSE-NEXT: movaps %xmm3, 176(%rax) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i64_stride6_vf4: @@ -171,18 +171,18 @@ ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: vmovapd (%rdi), %ymm0 ; AVX1-NEXT: vmovapd (%rsi), %ymm1 -; AVX1-NEXT: vmovaps (%rdx), %ymm8 +; AVX1-NEXT: vmovaps (%rdx), %ymm2 ; AVX1-NEXT: vmovapd (%r8), %ymm3 ; AVX1-NEXT: vmovapd (%r9), %ymm4 ; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] ; AVX1-NEXT: vmovaps (%rsi), %xmm6 ; AVX1-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3] -; AVX1-NEXT: vmovaps (%rcx), %xmm5 -; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm5, %ymm9 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0],ymm8[1,2,3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3] +; AVX1-NEXT: vmovaps (%rcx), %xmm8 +; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm8, %ymm9 ; AVX1-NEXT: vpermilps {{.*#+}} xmm10 = mem[2,3,2,3] ; AVX1-NEXT: vbroadcastsd 8(%r8), %ymm11 ; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] @@ -191,70 +191,70 @@ ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[3] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] -; AVX1-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: vmovapd 16(%rdx), %xmm3 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] -; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm8 -; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm6[0] -; AVX1-NEXT: vmovaps (%rdx), %xmm6 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm5[0] -; AVX1-NEXT: vmovaps %xmm5, 16(%rax) -; AVX1-NEXT: vmovaps %xmm4, (%rax) +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = 
ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vmovapd 16(%rdx), %xmm2 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm3 +; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm6[0] +; AVX1-NEXT: vmovaps (%rdx), %xmm4 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; AVX1-NEXT: vmovaps %xmm4, 16(%rax) +; AVX1-NEXT: vmovaps %xmm3, (%rax) ; AVX1-NEXT: vmovaps %ymm1, 96(%rax) ; AVX1-NEXT: vmovapd %ymm0, 128(%rax) ; AVX1-NEXT: vmovaps %ymm9, 64(%rax) -; AVX1-NEXT: vmovapd %ymm2, 32(%rax) -; AVX1-NEXT: vmovapd %ymm3, 160(%rax) +; AVX1-NEXT: vmovapd %ymm5, 32(%rax) +; AVX1-NEXT: vmovapd %ymm2, 160(%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: store_i64_stride6_vf4: ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-NEXT: vmovaps (%rsi), %ymm11 +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-NEXT: vmovaps (%rdx), %ymm2 ; AVX2-NEXT: vmovaps (%rcx), %ymm3 ; AVX2-NEXT: vmovaps (%r8), %ymm4 ; AVX2-NEXT: vmovaps (%r9), %xmm5 ; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm6 ; AVX2-NEXT: vmovaps (%rcx), %xmm7 -; AVX2-NEXT: vmovaps (%rdx), %xmm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm0[1],xmm7[1] +; AVX2-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] ; AVX2-NEXT: vbroadcastsd 8(%r8), %ymm10 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0] -; AVX2-NEXT: vmovaps (%rsi), %xmm6 -; AVX2-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm1[1],xmm6[1] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[0,1],ymm10[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX2-NEXT: vmovaps (%rsi), %xmm9 +; AVX2-NEXT: vmovaps (%rdi), %xmm10 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[0,1],ymm11[0,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm3 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX2-NEXT: vbroadcastsd 16(%r9), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; AVX2-NEXT: vmovaps %xmm0, 16(%rax) +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = 
ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] +; AVX2-NEXT: vbroadcastsd 16(%r9), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm8[0],xmm7[0] +; AVX2-NEXT: vmovaps %xmm3, 16(%rax) ; AVX2-NEXT: vmovaps %xmm1, (%rax) -; AVX2-NEXT: vmovaps %ymm10, 96(%rax) -; AVX2-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-NEXT: vmovaps %ymm11, 96(%rax) +; AVX2-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-NEXT: vmovaps %ymm2, 160(%rax) ; AVX2-NEXT: vmovaps %ymm5, 32(%rax) -; AVX2-NEXT: vmovaps %ymm9, 64(%rax) +; AVX2-NEXT: vmovaps %ymm6, 64(%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -309,96 +309,98 @@ ; SSE-LABEL: store_i64_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 16(%rdi), %xmm10 -; SSE-NEXT: movaps 32(%rdi), %xmm15 -; SSE-NEXT: movaps (%rsi), %xmm11 -; SSE-NEXT: movaps 16(%rsi), %xmm4 -; SSE-NEXT: movaps 32(%rsi), %xmm8 -; SSE-NEXT: movaps (%rdx), %xmm12 -; SSE-NEXT: movaps 16(%rdx), %xmm9 -; SSE-NEXT: movaps 32(%rdx), %xmm14 -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps 16(%rcx), %xmm6 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm3 +; SSE-NEXT: movaps (%rsi), %xmm9 +; SSE-NEXT: movaps 16(%rsi), %xmm13 +; SSE-NEXT: movaps 32(%rsi), %xmm12 +; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: movaps 16(%rdx), %xmm4 +; SSE-NEXT: movaps 32(%rdx), %xmm7 +; SSE-NEXT: movaps (%rcx), %xmm10 +; SSE-NEXT: movaps 16(%rcx), %xmm14 ; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm13 -; SSE-NEXT: movaps (%r9), %xmm0 -; SSE-NEXT: movaps 16(%r9), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movaps 16(%r8), %xmm8 +; SSE-NEXT: movaps (%r9), %xmm11 +; SSE-NEXT: movaps 16(%r9), %xmm15 +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] +; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill +; SSE-NEXT: 
unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] -; SSE-NEXT: movaps 32(%r8), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm14[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm15[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm12 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r8), %xmm12 ; SSE-NEXT: movaps 32(%r9), %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 48(%rdi), %xmm6 -; SSE-NEXT: movaps 48(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-NEXT: movaps 48(%rdx), %xmm3 -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps 48(%r8), %xmm2 +; SSE-NEXT: movaps %xmm12, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 48(%rdi), %xmm5 +; SSE-NEXT: movaps 48(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-NEXT: movaps 48(%rdx), %xmm2 +; SSE-NEXT: movaps 48(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps 48(%r8), %xmm3 ; SSE-NEXT: movaps 48(%r9), %xmm4 -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 368(%rax) -; SSE-NEXT: movaps %xmm3, 352(%rax) -; SSE-NEXT: movaps %xmm6, 336(%rax) +; SSE-NEXT: movaps 
%xmm3, 368(%rax) +; SSE-NEXT: movaps %xmm2, 352(%rax) +; SSE-NEXT: movaps %xmm5, 336(%rax) ; SSE-NEXT: movaps %xmm0, 320(%rax) ; SSE-NEXT: movaps %xmm1, 304(%rax) -; SSE-NEXT: movaps %xmm7, 288(%rax) -; SSE-NEXT: movaps %xmm5, 272(%rax) -; SSE-NEXT: movaps %xmm14, 256(%rax) +; SSE-NEXT: movaps %xmm6, 288(%rax) +; SSE-NEXT: movaps %xmm12, 272(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rax) -; SSE-NEXT: movaps %xmm8, 224(%rax) -; SSE-NEXT: movaps %xmm12, 208(%rax) +; SSE-NEXT: movaps %xmm7, 224(%rax) +; SSE-NEXT: movaps %xmm8, 208(%rax) ; SSE-NEXT: movaps %xmm15, 192(%rax) -; SSE-NEXT: movaps %xmm13, 176(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps %xmm10, 128(%rax) -; SSE-NEXT: movaps %xmm9, 112(%rax) +; SSE-NEXT: movaps %xmm14, 128(%rax) +; SSE-NEXT: movaps %xmm13, 112(%rax) ; SSE-NEXT: movaps %xmm11, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) @@ -406,10 +408,8 @@ ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm10, 32(%rax) +; SSE-NEXT: movaps %xmm9, 16(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $24, %rsp @@ -417,7 +417,7 @@ ; ; AVX1-LABEL: store_i64_stride6_vf8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd (%rdi), %ymm15 +; AVX1-NEXT: vmovapd (%rdi), %ymm7 ; AVX1-NEXT: vmovapd 32(%rdi), %ymm12 ; AVX1-NEXT: vmovapd (%rsi), %ymm9 ; AVX1-NEXT: vmovapd 32(%rsi), %ymm13 @@ -437,75 +437,75 @@ ; AVX1-NEXT: vmovaps 32(%rsi), %xmm6 ; AVX1-NEXT: vmovaps (%rdi), %xmm4 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm6[1] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-NEXT: vblendpd {{.*#+}} ymm7 = ymm14[0],ymm7[1,2,3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3] -; AVX1-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm8[1],xmm6[1] +; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0],ymm15[1,2,3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3] +; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] ; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX1-NEXT: vbroadcastsd 16(%rcx), %ymm7 -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm11 = ymm1[0],ymm7[1],ymm1[2,3] -; 
AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm13[2,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm7[2,3] -; AVX1-NEXT: vshufpd {{.*#+}} ymm12 = ymm7[0],ymm1[0],ymm7[2],ymm1[3] +; AVX1-NEXT: vbroadcastsd 16(%rcx), %ymm11 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] +; AVX1-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm4[1],xmm3[1] +; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-NEXT: vblendpd {{.*#+}} ymm15 = ymm10[0],ymm15[1,2,3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm2[2,3],ymm13[2,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm15[0],ymm12[2],ymm15[3] ; AVX1-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-NEXT: vbroadcastsd 40(%r8), %ymm7 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] -; AVX1-NEXT: vinsertf128 $1, 32(%r9), %ymm14, %ymm7 -; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm7[2,3],ymm1[4,5],ymm7[6,7] -; AVX1-NEXT: vmovapd (%r9), %ymm1 -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm15[1],ymm9[1],ymm15[3],ymm9[3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm1[2,3],ymm9[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,3,2,3] +; AVX1-NEXT: vbroadcastsd 40(%r8), %ymm15 +; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7] +; AVX1-NEXT: vinsertf128 $1, 32(%r9), %ymm14, %ymm15 +; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX1-NEXT: vmovapd (%r9), %ymm15 +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] ; AVX1-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[3] ; AVX1-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,3,2,3] ; AVX1-NEXT: vbroadcastsd 8(%r8), %ymm10 ; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] ; AVX1-NEXT: vmovaps (%rcx), %xmm10 -; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm10, %ymm15 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3],ymm9[4,5],ymm15[6,7] -; AVX1-NEXT: vmovapd 48(%rdx), %xmm5 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] -; AVX1-NEXT: vbroadcastsd 56(%r8), %ymm15 -; AVX1-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3] -; AVX1-NEXT: vmovapd 16(%rdx), %xmm5 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] -; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm15 -; AVX1-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; AVX1-NEXT: vmovaps 32(%rdx), %xmm5 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0] +; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm10, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7] +; AVX1-NEXT: vmovapd 48(%rdx), %xmm9 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],mem[1] +; AVX1-NEXT: vbroadcastsd 56(%r8), %ymm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2],ymm9[3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-NEXT: vmovapd 16(%rdx), %xmm2 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = 
xmm2[1],mem[1] +; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm9 +; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm15[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm6[0] +; AVX1-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0] ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX1-NEXT: vmovaps (%rdx), %xmm4 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm10[0] ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: vmovaps %xmm4, 16(%rax) ; AVX1-NEXT: vmovaps %xmm3, (%rax) -; AVX1-NEXT: vmovaps %xmm5, 208(%rax) -; AVX1-NEXT: vmovaps %xmm0, 192(%rax) -; AVX1-NEXT: vmovaps %ymm9, 64(%rax) +; AVX1-NEXT: vmovaps %xmm8, 208(%rax) +; AVX1-NEXT: vmovaps %xmm6, 192(%rax) +; AVX1-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-NEXT: vmovapd %ymm7, 128(%rax) ; AVX1-NEXT: vmovaps %ymm13, 256(%rax) ; AVX1-NEXT: vmovapd %ymm12, 320(%rax) ; AVX1-NEXT: vmovapd %ymm11, 32(%rax) -; AVX1-NEXT: vmovaps %ymm8, 96(%rax) -; AVX1-NEXT: vmovapd %ymm1, 160(%rax) +; AVX1-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-NEXT: vmovapd %ymm2, 160(%rax) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-NEXT: vmovapd %ymm2, 352(%rax) +; AVX1-NEXT: vmovapd %ymm1, 352(%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -513,87 +513,87 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rax ; AVX2-NEXT: vmovaps 32(%rdx), %ymm7 -; AVX2-NEXT: vmovaps (%r8), %ymm11 +; AVX2-NEXT: vmovaps (%r8), %ymm4 ; AVX2-NEXT: vmovaps 32(%r8), %ymm13 -; AVX2-NEXT: vmovaps (%r9), %xmm8 -; AVX2-NEXT: vmovaps 32(%r9), %xmm0 -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-NEXT: vmovaps (%rcx), %xmm5 -; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 32(%rcx), %xmm15 -; AVX2-NEXT: vmovaps (%rdx), %xmm3 -; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 32(%rdx), %xmm12 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm15[1] -; AVX2-NEXT: vbroadcastsd 40(%r8), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = xmm0[0,0] -; AVX2-NEXT: vmovaps (%rsi), %xmm4 -; AVX2-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX2-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm2[1],xmm1[1] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[0,1],ymm10[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vmovaps (%r9), %xmm14 +; AVX2-NEXT: vmovaps 32(%r9), %xmm6 +; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%rcx), %xmm3 +; AVX2-NEXT: vmovaps (%rdx), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%rdx), %xmm5 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm5[1],xmm3[1] +; AVX2-NEXT: vbroadcastsd 40(%r8), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = 
xmm3[1],xmm5[1] -; AVX2-NEXT: vbroadcastsd 8(%r8), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5],ymm9[6,7] -; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = xmm6[0,0] +; AVX2-NEXT: vmovaps (%rsi), %xmm8 +; AVX2-NEXT: vmovaps 32(%rsi), %xmm11 +; AVX2-NEXT: vmovaps (%rdi), %xmm10 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm12[1],xmm11[1] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm13[0,1],ymm9[0,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm6[2,3],ymm9[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = xmm8[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm6[1],xmm4[1] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm14[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm3[2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm8 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm8[2,3] -; AVX2-NEXT: vmovaps 32(%rdi), %ymm8 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm2[1],xmm1[1] +; AVX2-NEXT: vbroadcastsd 8(%r8), %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5],ymm9[6,7] +; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovddup {{.*#+}} xmm14 = xmm14[0,0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm10[1],xmm8[1] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[0,1],ymm15[0,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3],ymm0[2,3] +; AVX2-NEXT: vmovaps 32(%rdi), %ymm15 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm6[2,3] ; AVX2-NEXT: vbroadcastsd 48(%r9), %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3] ; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX2-NEXT: vmovaps (%rdx), %ymm7 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm13 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm13[2,3] -; AVX2-NEXT: vmovaps (%rdi), %ymm13 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm15 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] +; AVX2-NEXT: vmovaps (%rdi), %ymm15 ; AVX2-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX2-NEXT: vbroadcastsd 16(%r9), 
%ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] +; AVX2-NEXT: vbroadcastsd 16(%r9), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3] ; AVX2-NEXT: vbroadcastsd 16(%rcx), %ymm7 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm12[0],xmm15[0] -; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-NEXT: # xmm6 = xmm6[0],mem[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm5[0],xmm3[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm5 = xmm10[0],xmm8[0] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-NEXT: # xmm8 = xmm8[0],mem[0] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovaps %xmm6, 16(%rax) -; AVX2-NEXT: vmovaps %xmm4, (%rax) -; AVX2-NEXT: vmovaps %xmm2, 208(%rax) -; AVX2-NEXT: vmovaps %xmm1, 192(%rax) +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovaps %xmm8, 16(%rax) +; AVX2-NEXT: vmovaps %xmm5, (%rax) +; AVX2-NEXT: vmovaps %xmm3, 208(%rax) +; AVX2-NEXT: vmovaps %xmm7, 192(%rax) ; AVX2-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-NEXT: vmovaps %ymm9, 128(%rax) -; AVX2-NEXT: vmovaps %ymm7, 160(%rax) -; AVX2-NEXT: vmovaps %ymm3, 288(%rax) -; AVX2-NEXT: vmovaps %ymm10, 320(%rax) -; AVX2-NEXT: vmovaps %ymm5, 352(%rax) +; AVX2-NEXT: vmovaps %ymm4, 128(%rax) +; AVX2-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-NEXT: vmovaps %ymm1, 288(%rax) +; AVX2-NEXT: vmovaps %ymm6, 320(%rax) +; AVX2-NEXT: vmovaps %ymm2, 352(%rax) ; AVX2-NEXT: vmovaps %ymm14, 32(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 64(%rax) @@ -607,23 +607,23 @@ ; ; AVX512-LABEL: store_i64_stride6_vf8: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqu64 (%rdi), %zmm4 ; AVX512-NEXT: vmovdqu64 (%rsi), %zmm6 ; AVX512-NEXT: vmovdqu64 (%rdx), %zmm2 ; AVX512-NEXT: vmovdqu64 (%rcx), %zmm3 -; AVX512-NEXT: vmovdqu64 (%r8), %zmm10 +; AVX512-NEXT: vmovdqu64 (%r8), %zmm1 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: movb $12, %al -; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: movb $12, %r10b +; AVX512-NEXT: kmovd %r10d, %k1 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512-NEXT: movb $16, %al -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm0 {%k2} +; AVX512-NEXT: movb $16, %r10b +; AVX512-NEXT: kmovd %r10d, %k2 +; 
AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} ; AVX512-NEXT: vmovdqu64 (%r9), %zmm5 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -631,11 +631,11 @@ ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-NEXT: movb $48, %al -; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: movb $48, %r9b +; AVX512-NEXT: kmovd %r9d, %k2 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,9,u,4,5,6,7> -; AVX512-NEXT: vpermi2q %zmm10, %zmm8, %zmm7 +; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] @@ -646,47 +646,47 @@ ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,13,u,4,5,6,7> -; AVX512-NEXT: vpermi2q %zmm10, %zmm9, %zmm7 +; AVX512-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,1,9,0,8,1,9] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 -; AVX512-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k1} -; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,8,6,7] -; AVX512-NEXT: vpermi2q %zmm5, %zmm1, %zmm7 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} +; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] +; AVX512-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm11[0,1,2,3],zmm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <14,u,2,3,4,5,15,u> -; AVX512-NEXT: vpermi2q %zmm10, %zmm1, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,14,2,3,4,5,6,15] -; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> +; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] +; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: vmovdqa (%rdi), %ymm2 ; 
AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512-NEXT: vpermi2q %zmm10, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,10,2,3,4,5,6,11] -; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,12,6,7] -; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, 192(%r10) -; AVX512-NEXT: vmovdqu64 %zmm2, 128(%r10) -; AVX512-NEXT: vmovdqu64 %zmm1, 320(%r10) -; AVX512-NEXT: vmovdqu64 %zmm9, 256(%r10) -; AVX512-NEXT: vmovdqu64 %zmm8, 64(%r10) -; AVX512-NEXT: vmovdqu64 %zmm7, (%r10) +; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, 192(%rax) +; AVX512-NEXT: vmovdqu64 %zmm1, 128(%rax) +; AVX512-NEXT: vmovdqu64 %zmm4, 320(%rax) +; AVX512-NEXT: vmovdqu64 %zmm9, 256(%rax) +; AVX512-NEXT: vmovdqu64 %zmm8, 64(%rax) +; AVX512-NEXT: vmovdqu64 %zmm10, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 32 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -348,161 +348,161 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa (%rsi), %xmm13 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa (%rdx), %xmm8 -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm10 +; SSE-NEXT: movdqa (%rdx), %xmm1 +; SSE-NEXT: movdqa 16(%rdx), %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,6] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: 
pshuflw {{.*#+}} xmm2 = xmm10[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm6[0,1,2,3,4,5,5,6] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; 
SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,6] +; SSE-NEXT: pandn %xmm9, %xmm6 +; SSE-NEXT: por %xmm11, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] +; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm13, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: por %xmm12, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,6,6] ; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm10, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE-NEXT: 
pshufd {{.*#+}} xmm8 = xmm8[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,7,7] +; SSE-NEXT: pandn %xmm10, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,6] +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm9, 32(%rcx) ; SSE-NEXT: movdqa %xmm7, 48(%rcx) -; SSE-NEXT: movdqa %xmm1, 80(%rcx) +; SSE-NEXT: movdqa %xmm11, 80(%rcx) ; SSE-NEXT: movdqa %xmm6, 16(%rcx) -; SSE-NEXT: movdqa %xmm12, 64(%rcx) +; SSE-NEXT: movdqa %xmm3, 64(%rcx) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i8_stride3_vf32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vmovdqa (%rsi), %xmm8 +; AVX1-NEXT: vmovdqa (%rsi), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm3 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm8[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vmovdqa (%rdx), %xmm6 ; AVX1-NEXT: vmovdqa 16(%rdx), %xmm7 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = 
xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm5 -; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 64(%rcx) -; AVX1-NEXT: vmovdqa %xmm1, 80(%rcx) -; AVX1-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 64(%rcx) +; AVX1-NEXT: vmovdqa %xmm4, 80(%rcx) +; AVX1-NEXT: vmovdqa %xmm5, 32(%rcx) ; AVX1-NEXT: vmovdqa %xmm3, 48(%rcx) -; AVX1-NEXT: vmovdqa %xmm6, (%rcx) -; AVX1-NEXT: vmovdqa %xmm5, 16(%rcx) +; AVX1-NEXT: vmovdqa %xmm2, (%rcx) +; AVX1-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: store_i8_stride3_vf32: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll @@ -278,43 +278,43 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm6 +; SSE-NEXT: movdqa (%rdx), %xmm7 ; SSE-NEXT: movdqa 16(%rdx), %xmm4 -; SSE-NEXT: movdqa (%rcx), %xmm6 +; SSE-NEXT: movdqa (%rcx), %xmm8 ; SSE-NEXT: movdqa 16(%rcx), %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa 
%xmm2, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; 
SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movdqa %xmm1, 96(%r8) -; SSE-NEXT: movdqa %xmm3, 112(%r8) -; SSE-NEXT: movdqa %xmm6, 64(%r8) -; SSE-NEXT: movdqa %xmm7, 80(%r8) +; SSE-NEXT: movdqa %xmm6, 112(%r8) +; SSE-NEXT: movdqa %xmm8, 64(%r8) +; SSE-NEXT: movdqa %xmm10, 80(%r8) ; SSE-NEXT: movdqa %xmm0, 32(%r8) ; SSE-NEXT: movdqa %xmm5, 48(%r8) ; SSE-NEXT: movdqa %xmm2, (%r8) -; SSE-NEXT: movdqa %xmm10, 16(%r8) +; SSE-NEXT: movdqa %xmm3, 16(%r8) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i8_stride4_vf32: @@ -325,32 +325,32 @@ ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX1-NEXT: vmovdqa (%rcx), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rcx), %xmm3 ; AVX1-NEXT: vmovdqa (%rdx), %xmm6 ; AVX1-NEXT: vmovdqa 16(%rdx), %xmm7 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vmovaps %ymm0, (%r8) -; AVX1-NEXT: vmovaps %ymm3, 96(%r8) -; AVX1-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, (%r8) +; AVX1-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-NEXT: vmovaps %ymm3, 64(%r8) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -249,17 +249,17 @@ ; SSE-LABEL: store_i8_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{.*#+}} xmm9 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; SSE-NEXT: movq {{.*#+}} xmm10 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: movq {{.*#+}} xmm8 = mem[0],zero +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] ; SSE-NEXT: movdqa %xmm3, %xmm5 @@ -269,43 +269,43 @@ ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm6, %xmm7 ; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: por %xmm7, %xmm10 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,3,3,3] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm1, %xmm6 ; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,2,2] -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm1, 32(%rax) +; SSE-NEXT: movdqa %xmm8, 32(%rax) ; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm2, (%rax) +; SSE-NEXT: movdqa %xmm10, (%rax) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i8_stride6_vf8: @@ -429,142 +429,141 @@ define void @store_i8_stride6_vf16(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa (%rdx), %xmm11 -; SSE-NEXT: movdqa (%rcx), %xmm14 -; SSE-NEXT: movdqa (%r8), %xmm15 -; SSE-NEXT: movdqa (%r9), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa (%rsi), %xmm15 +; SSE-NEXT: movdqa (%rdx), %xmm9 +; SSE-NEXT: movdqa (%rcx), %xmm13 +; SSE-NEXT: movdqa (%r8), %xmm7 +; SSE-NEXT: movdqa (%r9), %xmm14 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[1,0,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm8 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE-NEXT: pshufd {{.*#+}} 
xmm1 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm11, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm6, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm8 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = 
xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,2,2] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[3,3,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,3] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] -; SSE-NEXT: pandn %xmm5, %xmm13 -; SSE-NEXT: por %xmm4, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,1,1] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pandn %xmm15, %xmm13 +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm8, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,2,2] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm13 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm2, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm13, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm13, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; SSE-NEXT: pandn %xmm3, 
%xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,0,0,0] +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,2,2] -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,2,2] +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: pshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm14, 16(%rax) -; SSE-NEXT: movdqa %xmm8, 48(%rax) -; SSE-NEXT: movdqa %xmm13, 80(%rax) -; SSE-NEXT: movdqa %xmm2, 64(%rax) -; SSE-NEXT: movdqa %xmm10, 32(%rax) +; SSE-NEXT: movdqa %xmm15, 16(%rax) +; SSE-NEXT: movdqa %xmm6, 48(%rax) +; SSE-NEXT: movdqa %xmm11, 80(%rax) +; SSE-NEXT: movdqa %xmm14, 64(%rax) +; SSE-NEXT: movdqa %xmm12, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq @@ -572,60 +571,60 @@ ; AVX1-LABEL: store_i8_stride6_vf16: ; AVX1: # %bb.0: ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: vmovdqa (%rdi), %xmm10 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-NEXT: vmovdqa (%rsi), %xmm2 ; AVX1-NEXT: vmovdqa (%rdx), %xmm3 ; AVX1-NEXT: vmovdqa (%rcx), %xmm4 -; AVX1-NEXT: vmovdqa (%r8), %xmm8 -; AVX1-NEXT: vmovdqa (%r9), %xmm9 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX1-NEXT: vmovdqa (%r8), %xmm5 +; AVX1-NEXT: vmovdqa (%r9), %xmm6 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2],xmm0[3,4],xmm6[5],xmm0[6,7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[1,0,2,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3,4],xmm9[5],xmm0[6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5],xmm10[6],xmm0[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[1,0,2,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6],xmm11[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,0,0,0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[1,0,2,2,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6],xmm5[7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,5,6,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6],xmm5[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX1-NEXT: vpshufd 
{{.*#+}} xmm5 = xmm4[0,0,0,0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,3,3,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,6,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,6,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,2,2] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-NEXT: vmovaps %ymm11, (%rax) +; AVX1-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-NEXT: vmovaps %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -738,271 +737,268 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 16(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm8 -; SSE-NEXT: movdqa 16(%rcx), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r8), %xmm13 -; SSE-NEXT: movdqa 16(%r9), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: subq $40, %rsp +; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa 16(%rsi), %xmm9 +; SSE-NEXT: movdqa 16(%rdx), %xmm13 +; SSE-NEXT: movdqa 16(%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm2 +; SSE-NEXT: movdqa 16(%r9), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,2,2,3] ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: por %xmm6, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm12, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm15, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: pandn %xmm9, %xmm6 -; SSE-NEXT: pand %xmm12, %xmm7 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm5, 
%xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm15 +; SSE-NEXT: por %xmm15, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm6 ; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,0,1,1] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[1,1,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = 
xmm8[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: pand %xmm7, %xmm6 ; SSE-NEXT: por %xmm6, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa (%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,0,0,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[3,3,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pand %xmm10, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa (%rdx), %xmm13 -; SSE-NEXT: movdqa (%rcx), %xmm3 +; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pand %xmm12, %xmm8 +; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: por %xmm8, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,5,6,7,7] -; 
SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa (%r8), %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa (%r9), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm11, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa (%rcx), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = 
xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,3] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa (%r9), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 ; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pand %xmm11, %xmm9 +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm3, 
%xmm8 +; SSE-NEXT: pand %xmm7, %xmm14 +; SSE-NEXT: por %xmm14, %xmm8 ; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: por %xmm4, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: pand %xmm12, %xmm10 +; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] ; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] -; SSE-NEXT: pandn %xmm3, %xmm14 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,1,1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[1,0,2,2,4,5,6,7] +; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: por %xmm10, %xmm11 +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm11, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,1,1] +; SSE-NEXT: pand %xmm1, 
%xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,0,0] -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm10, (%rax) -; SSE-NEXT: movdqa %xmm14, 16(%rax) -; SSE-NEXT: movdqa %xmm15, 32(%rax) -; SSE-NEXT: movdqa %xmm7, 48(%rax) -; SSE-NEXT: movdqa %xmm11, 64(%rax) +; SSE-NEXT: movdqa %xmm7, (%rax) +; SSE-NEXT: movdqa %xmm5, 16(%rax) +; SSE-NEXT: movdqa %xmm12, 32(%rax) +; SSE-NEXT: movdqa %xmm8, 48(%rax) +; SSE-NEXT: movdqa %xmm9, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1011,184 +1007,183 @@ ; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rax) -; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i8_stride6_vf32: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rax ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-NEXT: vandnps %ymm1, %ymm7, %ymm2 +; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-NEXT: vandnps %ymm1, %ymm9, %ymm2 ; AVX1-NEXT: vmovdqa 16(%rcx), %xmm1 ; AVX1-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = 
xmm8[0,1,2,3,5,6,7,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,6,7,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vorps %ymm2, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovdqa 16(%r8), %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10],zero,xmm4[12,13,14,15] -; AVX1-NEXT: vmovdqa 16(%r9), %xmm5 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm5[8],zero,zero,zero,zero,zero,xmm5[9],zero,zero,zero,zero -; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa %xmm4, %xmm9 -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,xmm5[7] +; AVX1-NEXT: vandps %ymm3, %ymm9, %ymm3 +; AVX1-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vmovdqa 16(%r8), %xmm12 +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm12[8,u],zero,zero,zero,zero,xmm12[9,u],zero,zero,zero,zero +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10],zero,xmm3[12,13,14,15] +; AVX1-NEXT: vmovdqa 16(%r9), %xmm14 +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm14[8],zero,zero,zero,zero,zero,xmm14[9],zero,zero,zero,zero ; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm12[5,u],zero,zero,zero,zero,xmm12[6,u],zero,zero,zero,zero,xmm12[7,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa %xmm3, %xmm4 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm14[5],zero,zero,zero,zero,zero,xmm14[6],zero,zero,zero,zero,zero,xmm14[7] +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw 
{{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vandnps %ymm1, %ymm7, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm9, %ymm1 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] -; AVX1-NEXT: vmovdqa %xmm9, %xmm4 -; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm5[13],zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,xmm5[15] -; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> +; AVX1-NEXT: vpshufb %xmm15, %xmm12, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm14[13],zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,xmm14[15] +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[10,u],zero,zero,zero,zero,xmm12[11,u],zero,zero,zero,zero,xmm12[12,u],zero,zero ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6],zero,xmm0[8,9,10,11,12],zero,xmm0[14,15] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm5[10],zero,zero,zero,zero,zero,xmm5[11],zero,zero,zero,zero,zero,xmm5[12],zero,zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128] +; AVX1-NEXT: vpshufb %xmm8, %xmm14, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rsi), %xmm10 -; AVX1-NEXT: vmovdqa (%rdi), %xmm9 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,2,2] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; AVX1-NEXT: vmovdqa (%rsi), %xmm3 +; AVX1-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa (%rcx), %xmm15 -; AVX1-NEXT: vmovdqa (%rdx), %xmm13 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vmovdqa 
(%rcx), %xmm7 +; AVX1-NEXT: vmovdqa (%rdx), %xmm6 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm14 = xmm11[0,1,2,3,5,6,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[2,2,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm6 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,5,6,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-NEXT: vandnps %ymm1, %ymm9, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm10 ; AVX1-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm1[13,u],zero,zero,zero,zero,xmm1[14,u],zero,zero,zero,zero,xmm1[15,u] -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7] -; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm7 +; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm15 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6],xmm0[7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero ; AVX1-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15] -; AVX1-NEXT: vpor %xmm4, %xmm7, %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] -; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero -; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm3[1,1,2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[1,0,2,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-NEXT: vandps %ymm7, %ymm4, %ymm4 -; AVX1-NEXT: vandnps %ymm6, %ymm7, %ymm6 -; AVX1-NEXT: vorps %ymm6, %ymm4, %ymm4 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5],xmm3[6],xmm6[7] -; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = 
zero,xmm5[2],zero,zero,zero,zero,zero,xmm5[3],zero,zero,zero,zero,zero,xmm5[4],zero,zero -; AVX1-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,u],zero,zero,zero,zero,xmm2[1,u],zero,zero,zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm3 -; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,0,1,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[1,0,2,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-NEXT: vandnps %ymm4, %ymm7, %ymm4 -; AVX1-NEXT: vorps %ymm4, %ymm2, %ymm4 +; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15] +; AVX1-NEXT: vpor %xmm9, %xmm15, %xmm9 +; AVX1-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm8 +; AVX1-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[1,0,2,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-NEXT: vandps %ymm13, %ymm8, %ymm8 +; AVX1-NEXT: vandnps %ymm9, %ymm13, %ymm9 +; AVX1-NEXT: vorps %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm9 +; AVX1-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[2,u],zero,zero,zero,zero,xmm12[3,u],zero,zero,zero,zero,xmm12[4,u],zero,zero +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1,2],xmm11[3],xmm9[4,5],xmm11[6],xmm9[7] +; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] +; AVX1-NEXT: vpshufb %xmm11, 
%xmm14, %xmm15 +; AVX1-NEXT: vpor %xmm15, %xmm9, %xmm9 +; AVX1-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> +; AVX1-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm12[2],xmm8[3,4],xmm12[5],xmm8[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm9, %xmm14, %xmm12 +; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm12 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[1,0,2,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX1-NEXT: vandnps %ymm7, %ymm13, %ymm7 +; AVX1-NEXT: vorps %ymm7, %ymm2, %ymm7 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[2,u],zero,zero,zero,zero,xmm1[3,u],zero,zero,zero,zero,xmm1[4,u],zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5],xmm2[6],xmm6[7] +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5,6],zero,xmm2[8,9,10,11,12],zero,xmm2[14,15] -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[2],zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero -; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm14, %xmm0, %xmm6 -; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,0,1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[1,0,2,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-NEXT: vandnps %ymm5, %ymm6, %ymm5 -; AVX1-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-NEXT: vorps %ymm5, %ymm3, %ymm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] -; AVX1-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = 
zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero -; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm8 +; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm8 +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX1-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm8 +; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,5,6,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,2,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero +; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8],zero,xmm1[10,11,12,13,14],zero @@ -1196,11 +1191,10 @@ ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX1-NEXT: vmovdqa %xmm5, 48(%rax) -; AVX1-NEXT: vmovdqa %xmm4, (%rax) +; AVX1-NEXT: vmovdqa %xmm4, 48(%rax) +; AVX1-NEXT: vmovdqa %xmm7, (%rax) ; AVX1-NEXT: vmovdqa %xmm2, 16(%rax) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm0, 96(%rax) +; AVX1-NEXT: vmovdqa %xmm12, 96(%rax) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 112(%rax) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1215,388 +1209,372 @@ ; AVX1-NEXT: vmovaps %xmm0, 128(%rax) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm0, 144(%rax) -; AVX1-NEXT: popq %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $72, %rsp -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; 
AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: subq $40, %rsp +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm8 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm10 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm2[0],ymm10[0],ymm2[1],ymm10[1],ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[4],ymm10[4],ymm2[5],ymm10[5],ymm2[6],ymm10[6],ymm2[7],ymm10[7],ymm2[16],ymm10[16],ymm2[17],ymm10[17],ymm2[18],ymm10[18],ymm2[19],ymm10[19],ymm2[20],ymm10[20],ymm2[21],ymm10[21],ymm2[22],ymm10[22],ymm2[23],ymm10[23] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm12 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm4, %ymm10 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm4, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm15 ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm10 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm13 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm13 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = 
[255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm14, %ymm3 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm11 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = 
xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15],ymm6[24],ymm7[24],ymm6[25],ymm7[25],ymm6[26],ymm7[26],ymm6[27],ymm7[27],ymm6[28],ymm7[28],ymm6[29],ymm7[29],ymm6[30],ymm7[30],ymm6[31],ymm7[31] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm7, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15],ymm6[24],ymm4[24],ymm6[25],ymm4[25],ymm6[26],ymm4[26],ymm6[27],ymm4[27],ymm6[28],ymm4[28],ymm6[29],ymm4[29],ymm6[30],ymm4[30],ymm6[31],ymm4[31] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1 -; 
AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-SLOW-NEXT: addq $72, %rsp +; AVX2-SLOW-NEXT: vmovdqa %ymm11, (%rax) +; AVX2-SLOW-NEXT: addq $40, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $40, %rsp -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm15 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm0, 
%xmm12, %xmm0 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm2 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[1],ymm10[1],ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[4],ymm10[4],ymm2[5],ymm10[5],ymm2[6],ymm10[6],ymm2[7],ymm10[7],ymm2[16],ymm10[16],ymm2[17],ymm10[17],ymm2[18],ymm10[18],ymm2[19],ymm10[19],ymm2[20],ymm10[20],ymm2[21],ymm10[21],ymm2[22],ymm10[22],ymm2[23],ymm10[23] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = 
<5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23] ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm15, %ymm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15] -; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm13 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm14 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm7[8],ymm0[8],ymm7[9],ymm0[9],ymm7[10],ymm0[10],ymm7[11],ymm0[11],ymm7[12],ymm0[12],ymm7[13],ymm0[13],ymm7[14],ymm0[14],ymm7[15],ymm0[15],ymm7[24],ymm0[24],ymm7[25],ymm0[25],ymm7[26],ymm0[26],ymm7[27],ymm0[27],ymm7[28],ymm0[28],ymm7[29],ymm0[29],ymm7[30],ymm0[30],ymm7[31],ymm0[31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm11 = 
ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm14, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm11 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = 
<255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm8[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm14, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: store_i8_stride6_vf32: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512-NEXT: vmovdqa (%rcx), %ymm11 -; AVX512-NEXT: vmovdqa (%r8), %ymm8 -; AVX512-NEXT: vmovdqa (%r9), %ymm9 -; AVX512-NEXT: vmovdqa (%rsi), %xmm14 +; AVX512-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512-NEXT: vmovdqa (%r8), %ymm0 +; AVX512-NEXT: vmovdqa (%r9), %ymm1 +; AVX512-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX512-NEXT: vpshufb %xmm6, %xmm14, %xmm0 -; AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX512-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm8 +; AVX512-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; 
AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm10, %xmm8, %xmm11 +; AVX512-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512-NEXT: vpshufb %xmm10, %xmm12, %xmm10 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX512-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512-NEXT: kmovd %ecx, %k1 -; AVX512-NEXT: vmovdqu16 %ymm0, %ymm2 {%k1} -; AVX512-NEXT: vmovdqa (%r9), %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm5 -; AVX512-NEXT: vmovdqa (%r8), %xmm7 -; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512-NEXT: vmovdqu16 %ymm6, %ymm10 {%k1} +; AVX512-NEXT: vmovdqa (%r9), %xmm11 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm6, %xmm11, %xmm13 +; AVX512-NEXT: vmovdqa (%r8), %xmm14 +; AVX512-NEXT: vpshufb %xmm6, %xmm14, %xmm6 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] ; AVX512-NEXT: movw $9362, %cx # imm = 0x2492 ; AVX512-NEXT: kmovd %ecx, %k2 -; AVX512-NEXT: vmovdqu16 %ymm0, %ymm2 {%k2} -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512-NEXT: vpermw %ymm0, %ymm5, %ymm0 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX512-NEXT: vprold $16, %xmm5, %xmm5 -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX512-NEXT: vmovdqu16 %ymm5, %ymm0 {%k2} -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512-NEXT: vmovdqu16 %ymm6, %ymm10 {%k2} +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512-NEXT: vpermw %ymm6, %ymm13, %ymm6 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; AVX512-NEXT: vprold $16, %xmm13, %xmm13 +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] +; AVX512-NEXT: vmovdqu16 %ymm13, %ymm6 {%k2} +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm13 = 
xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] -; AVX512-NEXT: vpermw %ymm5, %ymm15, %ymm0 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[16],ymm13[16],ymm12[17],ymm13[17],ymm12[18],ymm13[18],ymm12[19],ymm13[19],ymm12[20],ymm13[20],ymm12[21],ymm13[21],ymm12[22],ymm13[22],ymm12[23],ymm13[23] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] -; AVX512-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[16],ymm11[16],ymm10[17],ymm11[17],ymm10[18],ymm11[18],ymm10[19],ymm11[19],ymm10[20],ymm11[20],ymm10[21],ymm11[21],ymm10[22],ymm11[22],ymm10[23],ymm11[23] -; AVX512-NEXT: vprold $16, %ymm5, %ymm5 -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512-NEXT: vmovdqu16 %ymm5, %ymm2 {%k2} -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[16],ymm9[16],ymm8[17],ymm9[17],ymm8[18],ymm9[18],ymm8[19],ymm9[19],ymm8[20],ymm9[20],ymm8[21],ymm9[21],ymm8[22],ymm9[22],ymm8[23],ymm9[23] +; AVX512-NEXT: vpermw %ymm13, %ymm15, %ymm6 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] +; AVX512-NEXT: vpermw %ymm10, %ymm13, %ymm10 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] +; AVX512-NEXT: vprold $16, %ymm13, %ymm13 +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512-NEXT: vmovdqu16 %ymm13, %ymm10 {%k2} +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [10,9,8,11,10,9,8,11,10,9,8,11,12,12,12,12] -; AVX512-NEXT: vpermw %ymm5, %ymm15, %ymm2 {%k1} -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] -; AVX512-NEXT: vpermw %ymm1, %ymm5, %ymm1 -; 
AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] -; AVX512-NEXT: vpermw %ymm4, %ymm5, %ymm1 {%k1} -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] +; AVX512-NEXT: vpermw %ymm13, %ymm15, %ymm10 {%k1} +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] +; AVX512-NEXT: vpermw %ymm7, %ymm9, %ymm7 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] +; AVX512-NEXT: vpermw %ymm8, %ymm9, %ymm7 {%k1} +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] ; AVX512-NEXT: movw $-28087, %cx # imm = 0x9249 ; AVX512-NEXT: kmovd %ecx, %k3 -; AVX512-NEXT: vpermw %ymm3, %ymm4, %ymm1 {%k3} -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15],ymm10[24],ymm11[24],ymm10[25],ymm11[25],ymm10[26],ymm11[26],ymm10[27],ymm11[27],ymm10[28],ymm11[28],ymm10[29],ymm11[29],ymm10[30],ymm11[30],ymm10[31],ymm11[31] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15],ymm12[24],ymm13[24],ymm12[25],ymm13[25],ymm12[26],ymm13[26],ymm12[27],ymm13[27],ymm12[28],ymm13[28],ymm12[29],ymm13[29],ymm12[30],ymm13[30],ymm12[31],ymm13[31] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512-NEXT: vpermw %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512-NEXT: vpermw %ymm2, %ymm4, %ymm3 {%k1} -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15],ymm8[24],ymm9[24],ymm8[25],ymm9[25],ymm8[26],ymm9[26],ymm8[27],ymm9[27],ymm8[28],ymm9[28],ymm8[29],ymm9[29],ymm8[30],ymm9[30],ymm8[31],ymm9[31] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] -; AVX512-NEXT: vpermw %ymm2, %ymm4, %ymm3 {%k3} -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %ymm2, %ymm13, %ymm4 -; AVX512-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512-NEXT: vmovdqa {{.*#+}} 
ymm4 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %ymm4, %ymm11, %ymm5 -; AVX512-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX512-NEXT: vpermw %ymm8, %ymm9, %ymm7 {%k3} +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512-NEXT: vpermw %ymm9, %ymm10, %ymm9 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512-NEXT: vpermw %ymm8, %ymm10, %ymm9 {%k1} +; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] +; AVX512-NEXT: vpermw %ymm8, %ymm10, %ymm9 {%k3} +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX512-NEXT: vpshufb %ymm8, %ymm4, %ymm4 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512-NEXT: vmovdqu16 %ymm2, %ymm4 {%k1} -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %ymm2, %ymm9, %ymm5 -; AVX512-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[16],ymm5[16],ymm2[17],ymm5[17],ymm2[18],ymm5[18],ymm2[19],ymm5[19],ymm2[20],ymm5[20],ymm2[21],ymm5[21],ymm2[22],ymm5[22],ymm2[23],ymm5[23] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512-NEXT: vmovdqu16 %ymm2, %ymm4 {%k2} -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, 128(%rax) -; AVX512-NEXT: vmovdqu64 %zmm1, 64(%rax) -; AVX512-NEXT: vmovdqu64 
%zmm0, (%rax) +; AVX512-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512-NEXT: vmovdqu16 %ymm0, %ymm2 {%k2} +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, 128(%rax) +; AVX512-NEXT: vmovdqu64 %zmm7, 64(%rax) +; AVX512-NEXT: vmovdqu64 %zmm6, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 32 diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -51,48 +51,48 @@ ; CHECK-NEXT: shldw $1, %cx, %dx ; CHECK-NEXT: sarl $16, %ecx ; CHECK-NEXT: cmpl $16384, %ecx # imm = 0x4000 -; CHECK-NEXT: movl $32767, %r8d # imm = 0x7FFF -; CHECK-NEXT: cmovgel %r8d, %edx +; CHECK-NEXT: movl $32767, %eax # imm = 0x7FFF +; CHECK-NEXT: cmovgel %eax, %edx ; CHECK-NEXT: cmpl $-16384, %ecx # imm = 0xC000 ; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000 ; CHECK-NEXT: cmovll %ecx, %edx ; CHECK-NEXT: pextrw $1, %xmm0, %esi ; CHECK-NEXT: leal (%rsi,%rsi), %edi -; CHECK-NEXT: movswl %si, %eax -; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: movswl %si, %r8d +; CHECK-NEXT: movl %r8d, %esi ; CHECK-NEXT: shrl $16, %esi ; CHECK-NEXT: shldw $1, %di, %si -; CHECK-NEXT: sarl $16, %eax -; CHECK-NEXT: cmpl $16384, %eax # imm = 0x4000 -; CHECK-NEXT: cmovgel %r8d, %esi -; CHECK-NEXT: cmpl $-16384, %eax # imm = 0xC000 +; CHECK-NEXT: sarl $16, %r8d +; CHECK-NEXT: cmpl $16384, %r8d # imm = 0x4000 +; CHECK-NEXT: cmovgel %eax, %esi +; CHECK-NEXT: cmpl $-16384, %r8d # imm = 0xC000 ; CHECK-NEXT: cmovll %ecx, %esi -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: cwtl -; CHECK-NEXT: movl %eax, %edi -; CHECK-NEXT: shrl $16, %edi -; CHECK-NEXT: shldw $1, %ax, %di -; CHECK-NEXT: sarl $16, %eax -; CHECK-NEXT: cmpl $16384, %eax # imm = 0x4000 -; CHECK-NEXT: cmovgel %r8d, %edi -; CHECK-NEXT: cmpl $-16384, %eax # imm = 0xC000 -; CHECK-NEXT: cmovll %ecx, %edi -; CHECK-NEXT: movzwl %di, %eax -; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: movd %xmm0, %edi +; CHECK-NEXT: movswl %di, %edi +; CHECK-NEXT: movl %edi, %r8d +; CHECK-NEXT: shrl $16, %r8d +; CHECK-NEXT: shldw $1, %di, %r8w +; CHECK-NEXT: sarl $16, %edi +; CHECK-NEXT: cmpl $16384, %edi # imm = 0x4000 +; CHECK-NEXT: cmovgel %eax, %r8d +; CHECK-NEXT: cmpl $-16384, %edi # imm = 0xC000 +; CHECK-NEXT: cmovll %ecx, %r8d +; CHECK-NEXT: movzwl %r8w, %edi +; CHECK-NEXT: movd %edi, %xmm1 ; CHECK-NEXT: pinsrw $1, %esi, %xmm1 ; CHECK-NEXT: pinsrw $2, %edx, %xmm1 -; CHECK-NEXT: pextrw $3, %xmm0, %eax -; CHECK-NEXT: cwtl -; CHECK-NEXT: leal (,%rax,4), %edx -; CHECK-NEXT: movl %edx, %esi -; CHECK-NEXT: shrl $16, %esi -; CHECK-NEXT: shldw $1, %dx, %si -; CHECK-NEXT: sarl $14, %eax -; CHECK-NEXT: cmpl $16384, %eax # imm = 0x4000 -; CHECK-NEXT: cmovgel %r8d, %esi -; CHECK-NEXT: cmpl $-16384, %eax # imm = 0xC000 -; CHECK-NEXT: cmovll %ecx, %esi -; CHECK-NEXT: pinsrw $3, %esi, %xmm1 +; CHECK-NEXT: pextrw $3, %xmm0, %edx 
+; CHECK-NEXT: movswl %dx, %edx +; CHECK-NEXT: leal (,%rdx,4), %esi +; CHECK-NEXT: movl %esi, %edi +; CHECK-NEXT: shrl $16, %edi +; CHECK-NEXT: shldw $1, %si, %di +; CHECK-NEXT: sarl $14, %edx +; CHECK-NEXT: cmpl $16384, %edx # imm = 0x4000 +; CHECK-NEXT: cmovgel %eax, %edi +; CHECK-NEXT: cmpl $-16384, %edx # imm = 0xC000 +; CHECK-NEXT: cmovll %ecx, %edi +; CHECK-NEXT: pinsrw $3, %edi, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %t = call <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16> , <4 x i16> %a, i32 15) diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll @@ -201,45 +201,45 @@ define i64 @test_v16i64_v16i8(<16 x i8> %a0) { ; SSE2-LABEL: test_v16i64_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm12 -; SSE2-NEXT: movdqa %xmm1, %xmm11 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-NEXT: movdqa %xmm0, %xmm13 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE2-NEXT: movdqa %xmm4, %xmm10 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $24, %xmm6 -; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE2-NEXT: psrad $24, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 -; 
SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; SSE2-NEXT: paddq %xmm5, %xmm4 -; SSE2-NEXT: paddq %xmm11, %xmm4 -; SSE2-NEXT: paddq %xmm13, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE2-NEXT: movdqa %xmm6, %xmm11 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE2-NEXT: paddq %xmm10, %xmm11 +; SSE2-NEXT: paddq %xmm5, %xmm11 +; SSE2-NEXT: paddq %xmm8, %xmm11 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; SSE2-NEXT: paddq %xmm6, %xmm3 -; SSE2-NEXT: paddq %xmm1, %xmm3 -; SSE2-NEXT: paddq %xmm4, %xmm3 -; SSE2-NEXT: paddq %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE2-NEXT: paddq %xmm3, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm6 +; SSE2-NEXT: paddq %xmm1, %xmm6 +; SSE2-NEXT: paddq %xmm11, %xmm6 +; SSE2-NEXT: paddq %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE2-NEXT: paddq %xmm6, %xmm0 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; @@ -1976,11 +1976,11 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm8 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm6 ; AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm7 -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm6 -; AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm8 +; AVX1-NEXT: vpaddb %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpaddb %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -1993,7 +1993,7 @@ ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm8, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -302,24 +302,24 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm0, %xmm7, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm7, %xmm8 ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm2, %xmm0 -; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm8, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm7 ; AVX-NEXT: vmaxss %xmm0, %xmm6, %xmm0 -; AVX-NEXT: vblendvps %xmm2, %xmm6, %xmm0, %xmm0 -; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm7, %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm6 ; AVX-NEXT: vmaxss %xmm0, 
%xmm5, %xmm0 -; AVX-NEXT: vblendvps %xmm2, %xmm5, %xmm0, %xmm0 -; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm6, %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm5 ; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm5, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 ; AVX-NEXT: vmaxss %xmm0, %xmm4, %xmm0 ; AVX-NEXT: vblendvps %xmm1, %xmm4, %xmm0, %xmm0 @@ -327,25 +327,25 @@ ; AVX-NEXT: vmaxss %xmm0, %xmm3, %xmm0 ; AVX-NEXT: vblendvps %xmm1, %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 -; AVX-NEXT: vmaxss %xmm0, %xmm8, %xmm0 -; AVX-NEXT: vblendvps %xmm1, %xmm8, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512BW-LABEL: test_v8f32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; AVX512BW-NEXT: vmaxss %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm7, %xmm8 ; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmovss %xmm7, %xmm1, %xmm1 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1 -; AVX512BW-NEXT: vmaxss %xmm1, %xmm6, %xmm0 +; AVX512BW-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm8, %xmm8, %k1 +; AVX512BW-NEXT: vmaxss %xmm8, %xmm6, %xmm0 ; AVX512BW-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} ; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512BW-NEXT: vmaxss %xmm0, %xmm5, %xmm0 @@ -360,25 +360,25 @@ ; AVX512BW-NEXT: vmaxss %xmm0, %xmm2, %xmm0 ; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} ; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmaxss %xmm0, %xmm8, %xmm0 -; AVX512BW-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: test_v8f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm8 ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm2, %xmm2, %k1 -; AVX512VL-NEXT: vmaxss %xmm2, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm8, %xmm8, %k1 +; AVX512VL-NEXT: vmaxss %xmm8, %xmm6, %xmm0 ; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm0 @@ -393,8 +393,8 @@ ; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 ; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: 
vmaxss %xmm0, %xmm8, %xmm0 -; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a0) @@ -596,65 +596,65 @@ ; AVX512VL-LABEL: test_v16f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm3[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm10 = xmm3[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm11 = xmm6[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm12 = xmm6[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm6[1,1,3,3] -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm15 = xmm2[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm2[1,1,3,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm7 -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovss %xmm5, %xmm7, %xmm7 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm7, %xmm7, %k1 -; AVX512VL-NEXT: vmaxss %xmm7, %xmm4, %xmm0 -; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm14 = xmm0[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vmaxss %xmm0, %xmm15, %xmm16 ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm16, %xmm0 -; AVX512VL-NEXT: vmovss %xmm16, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm15, %xmm0 -; AVX512VL-NEXT: vmovss %xmm15, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm14, %xmm0 +; AVX512VL-NEXT: vmovss %xmm15, %xmm16, %xmm16 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm16, %xmm16, %k1 +; AVX512VL-NEXT: vmaxss %xmm16, %xmm14, %xmm0 ; AVX512VL-NEXT: vmovss %xmm14, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm6, %xmm0 -; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm13, %xmm0 ; AVX512VL-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm9, %xmm0 +; AVX512VL-NEXT: vmovss %xmm9, %xmm0, 
%xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm12, %xmm0 ; AVX512VL-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm11, %xmm0 ; AVX512VL-NEXT: vmovss %xmm11, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm10, %xmm0 ; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm9, %xmm0 -; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vmaxss %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm8, %xmm0 ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm0 +; AVX512VL-NEXT: vmovss %xmm7, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0) @@ -856,17 +856,17 @@ ; AVX512BW-LABEL: test_v8f64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm2[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] -; AVX512BW-NEXT: vmaxsd %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm7, %xmm8 ; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmovsd %xmm7, %xmm1, %xmm1 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 -; AVX512BW-NEXT: vmaxsd %xmm1, %xmm5, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm8, %xmm8, %k1 +; AVX512BW-NEXT: vmaxsd %xmm8, %xmm5, %xmm0 ; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} ; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512BW-NEXT: vmaxsd %xmm0, %xmm6, %xmm0 @@ -881,25 +881,25 @@ ; AVX512BW-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 ; AVX512BW-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmaxsd %xmm0, %xmm8, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: test_v8f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0] 
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] -; AVX512VL-NEXT: vmaxsd %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm7, %xmm8 ; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovsd %xmm7, %xmm2, %xmm2 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 -; AVX512VL-NEXT: vmaxsd %xmm2, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm8, %xmm8, %k1 +; AVX512VL-NEXT: vmaxsd %xmm8, %xmm5, %xmm0 ; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxsd %xmm0, %xmm6, %xmm0 @@ -914,8 +914,8 @@ ; AVX512VL-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxsd %xmm0, %xmm8, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a0) @@ -980,41 +980,40 @@ ; ; SSE41-LABEL: test_v16f64: ; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: movapd %xmm4, %xmm3 -; SSE41-NEXT: maxpd %xmm0, %xmm3 +; SSE41-NEXT: movapd %xmm4, %xmm8 +; SSE41-NEXT: maxpd %xmm0, %xmm8 ; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 ; SSE41-NEXT: movapd %xmm6, %xmm4 ; SSE41-NEXT: maxpd %xmm2, %xmm4 ; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 ; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: maxpd %xmm3, %xmm2 -; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: maxpd %xmm8, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm8, %xmm8 +; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm5, %xmm3 -; SSE41-NEXT: maxpd %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm5, %xmm4 +; SSE41-NEXT: maxpd %xmm1, %xmm4 ; SSE41-NEXT: cmpunordpd %xmm1, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 ; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: maxpd %xmm8, %xmm1 -; SSE41-NEXT: cmpunordpd %xmm8, %xmm8 -; SSE41-NEXT: movapd %xmm8, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: maxpd %xmm3, %xmm4 +; SSE41-NEXT: maxpd %xmm3, %xmm1 ; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 ; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: maxpd %xmm4, %xmm3 +; SSE41-NEXT: cmpunordpd %xmm4, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm1 ; SSE41-NEXT: maxpd %xmm2, %xmm1 ; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: movapd %xmm1, %xmm0 diff --git 
a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -235,24 +235,24 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm0, %xmm7, %xmm2 +; AVX-NEXT: vminss %xmm0, %xmm7, %xmm8 ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm2, %xmm0 -; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm8, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm7 ; AVX-NEXT: vminss %xmm0, %xmm6, %xmm0 -; AVX-NEXT: vblendvps %xmm2, %xmm6, %xmm0, %xmm0 -; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm7, %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm6 ; AVX-NEXT: vminss %xmm0, %xmm5, %xmm0 -; AVX-NEXT: vblendvps %xmm2, %xmm5, %xmm0, %xmm0 -; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvps %xmm6, %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm5 ; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm5, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 ; AVX-NEXT: vminss %xmm0, %xmm4, %xmm0 ; AVX-NEXT: vblendvps %xmm1, %xmm4, %xmm0, %xmm0 @@ -260,25 +260,25 @@ ; AVX-NEXT: vminss %xmm0, %xmm3, %xmm0 ; AVX-NEXT: vblendvps %xmm1, %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 -; AVX-NEXT: vminss %xmm0, %xmm8, %xmm0 -; AVX-NEXT: vblendvps %xmm1, %xmm8, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512BW-LABEL: test_v8f32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; AVX512BW-NEXT: vminss %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vminss %xmm0, %xmm7, %xmm8 ; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmovss %xmm7, %xmm1, %xmm1 {%k1} -; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1 -; AVX512BW-NEXT: vminss %xmm1, %xmm6, %xmm0 +; AVX512BW-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm8, %xmm8, %k1 +; AVX512BW-NEXT: vminss %xmm8, %xmm6, %xmm0 ; AVX512BW-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} ; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512BW-NEXT: vminss %xmm0, %xmm5, %xmm0 @@ -293,25 +293,25 @@ ; AVX512BW-NEXT: vminss %xmm0, %xmm2, %xmm0 ; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} ; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vminss %xmm0, %xmm8, %xmm0 -; AVX512BW-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; 
AVX512VL-LABEL: test_v8f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm8 ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm2, %xmm2, %k1 -; AVX512VL-NEXT: vminss %xmm2, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm7, %xmm8, %xmm8 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm8, %xmm8, %k1 +; AVX512VL-NEXT: vminss %xmm8, %xmm6, %xmm0 ; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm0 @@ -326,8 +326,8 @@ ; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0 ; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm8, %xmm0 -; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0) @@ -529,65 +529,65 @@ ; AVX512VL-LABEL: test_v16f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm3[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm10 = xmm3[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm11 = xmm6[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm12 = xmm6[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm6[1,1,3,3] -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm15 = xmm2[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm2[1,1,3,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm7 -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovss %xmm5, %xmm7, %xmm7 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm7, %xmm7, %k1 -; AVX512VL-NEXT: vminss %xmm7, %xmm4, %xmm0 -; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm14 = xmm0[1,0] +; AVX512VL-NEXT: 
vmovshdup {{.*#+}} xmm15 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vminss %xmm0, %xmm15, %xmm16 ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm16, %xmm0 -; AVX512VL-NEXT: vmovss %xmm16, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm15, %xmm0 -; AVX512VL-NEXT: vmovss %xmm15, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm14, %xmm0 +; AVX512VL-NEXT: vmovss %xmm15, %xmm16, %xmm16 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm16, %xmm16, %k1 +; AVX512VL-NEXT: vminss %xmm16, %xmm14, %xmm0 ; AVX512VL-NEXT: vmovss %xmm14, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm6, %xmm0 -; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm13, %xmm0 ; AVX512VL-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm9, %xmm0 +; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm12, %xmm0 ; AVX512VL-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm11, %xmm0 ; AVX512VL-NEXT: vmovss %xmm11, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm10, %xmm0 ; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm9, %xmm0 -; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vminss %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm8, %xmm0 ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm0 +; AVX512VL-NEXT: vmovss %xmm7, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a0) @@ -859,17 +859,17 @@ ; AVX512BW-LABEL: test_v8f64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm2[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] ; AVX512BW-NEXT: vextractf128 $1, %ymm0, 
%xmm5 ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] -; AVX512BW-NEXT: vminsd %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vminsd %xmm0, %xmm7, %xmm8 ; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vmovsd %xmm7, %xmm1, %xmm1 {%k1} -; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 -; AVX512BW-NEXT: vminsd %xmm1, %xmm5, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm8, %xmm8, %k1 +; AVX512BW-NEXT: vminsd %xmm8, %xmm5, %xmm0 ; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} ; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512BW-NEXT: vminsd %xmm0, %xmm6, %xmm0 @@ -884,25 +884,25 @@ ; AVX512BW-NEXT: vminsd %xmm0, %xmm2, %xmm0 ; AVX512BW-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512BW-NEXT: vminsd %xmm0, %xmm8, %xmm0 -; AVX512BW-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vminsd %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: test_v8f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] -; AVX512VL-NEXT: vminsd %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vminsd %xmm0, %xmm7, %xmm8 ; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovsd %xmm7, %xmm2, %xmm2 {%k1} -; AVX512VL-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 -; AVX512VL-NEXT: vminsd %xmm2, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm8, %xmm8, %k1 +; AVX512VL-NEXT: vminsd %xmm8, %xmm5, %xmm0 ; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminsd %xmm0, %xmm6, %xmm0 @@ -917,8 +917,8 @@ ; AVX512VL-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminsd %xmm0, %xmm8, %xmm0 -; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vminsd %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0) @@ -983,41 +983,40 @@ ; ; SSE41-LABEL: test_v16f64: ; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: movapd %xmm4, %xmm3 -; SSE41-NEXT: minpd %xmm0, %xmm3 +; SSE41-NEXT: movapd %xmm4, %xmm8 +; SSE41-NEXT: minpd %xmm0, %xmm8 ; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 ; SSE41-NEXT: movapd %xmm6, %xmm4 ; SSE41-NEXT: minpd %xmm2, %xmm4 ; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 ; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: minpd %xmm3, %xmm2 -; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: minpd %xmm8, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm8, %xmm8 +; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm5, %xmm3 -; SSE41-NEXT: minpd %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm5, %xmm4 +; SSE41-NEXT: minpd 
%xmm1, %xmm4 ; SSE41-NEXT: cmpunordpd %xmm1, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 ; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: minpd %xmm8, %xmm1 -; SSE41-NEXT: cmpunordpd %xmm8, %xmm8 -; SSE41-NEXT: movapd %xmm8, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: minpd %xmm3, %xmm4 +; SSE41-NEXT: minpd %xmm3, %xmm1 ; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 ; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: minpd %xmm4, %xmm3 +; SSE41-NEXT: cmpunordpd %xmm4, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm1 ; SSE41-NEXT: minpd %xmm2, %xmm1 ; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: movapd %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -460,16 +460,16 @@ ; SSE-NEXT: psllq $32, %xmm9 ; SSE-NEXT: pmuludq %xmm6, %xmm2 ; SSE-NEXT: paddq %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm0, %xmm6 -; SSE-NEXT: paddq %xmm8, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm0, %xmm8 +; SSE-NEXT: paddq %xmm6, %xmm8 +; SSE-NEXT: psllq $32, %xmm8 ; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm6, %xmm0 +; SSE-NEXT: paddq %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm7, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -738,33 +738,33 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm8 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm7, %xmm5, %xmm8 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm7, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX1-NEXT: vpxor %xmm4, %xmm8, %xmm9 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm9, %xmm7 +; AVX1-NEXT: 
vblendvpd %xmm7, %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm2 ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vxorpd %xmm4, %xmm5, %xmm1 -; AVX1-NEXT: vxorpd %xmm4, %xmm8, %xmm2 +; AVX1-NEXT: vxorpd %xmm4, %xmm6, %xmm1 +; AVX1-NEXT: vxorpd %xmm4, %xmm5, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm8, %xmm5, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm5, %xmm6, %xmm1 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm2 ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -744,7 +744,7 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm8 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm5 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm6 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm7 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6 @@ -752,23 +752,23 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vxorpd %xmm4, %xmm6, %xmm1 -; AVX1-NEXT: vxorpd %xmm4, %xmm8, %xmm2 +; AVX1-NEXT: vxorpd %xmm4, %xmm5, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm6, %xmm8, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm6, %xmm5, %xmm1 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm2 ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -342,10 +342,10 @@ ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm7 -; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = 
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm8 +; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 @@ -363,7 +363,7 @@ ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll --- a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll +++ b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll @@ -17,105 +17,105 @@ ; SSE-NEXT: testl %edx, %edx ; SSE-NEXT: jle .LBB0_9 ; SSE-NEXT: # %bb.1: # %for.body.preheader -; SSE-NEXT: movl %ecx, %r9d -; SSE-NEXT: movl %edx, %eax +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: movl %edx, %r9d ; SSE-NEXT: cmpl $31, %edx ; SSE-NEXT: ja .LBB0_3 ; SSE-NEXT: # %bb.2: ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: jmp .LBB0_6 ; SSE-NEXT: .LBB0_3: # %vector.ph -; SSE-NEXT: movl %eax, %edx +; SSE-NEXT: movl %r9d, %edx ; SSE-NEXT: andl $-32, %edx -; SSE-NEXT: movd %r9d, %xmm0 -; SSE-NEXT: movd %r8d, %xmm2 +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %r8d, %xmm1 ; SSE-NEXT: xorl %ecx, %ecx -; SSE-NEXT: pmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero -; SSE-NEXT: pmovzxdq {{.*#+}} xmm15 = xmm2[0],zero,xmm2[1],zero +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero +; SSE-NEXT: pmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero ; SSE-NEXT: .p2align 4, 0x90 ; SSE-NEXT: .LBB0_4: # %vector.body ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm11 = mem[0],zero +; SSE-NEXT: pcmpeqb %xmm8, %xmm0 ; SSE-NEXT: pmovsxbd %xmm0, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE-NEXT: pcmpeqb %xmm1, %xmm3 -; SSE-NEXT: pmovsxbd %xmm3, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] -; SSE-NEXT: pmovsxbd %xmm3, %xmm6 -; SSE-NEXT: pcmpeqb %xmm1, %xmm4 -; SSE-NEXT: pmovsxbd %xmm4, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] -; SSE-NEXT: pmovsxbd %xmm3, %xmm2 -; SSE-NEXT: pcmpeqb %xmm1, %xmm5 -; SSE-NEXT: pmovsxbd %xmm5, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: pmovsxbd %xmm3, %xmm10 -; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pslld %xmm14, %xmm4 -; SSE-NEXT: pslld %xmm15, %xmm3 -; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm3 -; SSE-NEXT: movdqu (%rdi,%rcx,4), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pslld %xmm14, %xmm5 -; SSE-NEXT: pslld %xmm15, %xmm8 +; SSE-NEXT: pcmpeqb %xmm8, %xmm1 +; SSE-NEXT: pmovsxbd %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: pmovsxbd %xmm1, %xmm6 +; SSE-NEXT: pcmpeqb %xmm8, %xmm2 +; SSE-NEXT: pmovsxbd %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: pmovsxbd %xmm1, %xmm4 +; SSE-NEXT: pcmpeqb 
%xmm8, %xmm11 +; SSE-NEXT: pmovsxbd %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] +; SSE-NEXT: pmovsxbd %xmm2, %xmm2 +; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pslld %xmm9, %xmm12 +; SSE-NEXT: pslld %xmm10, %xmm11 +; SSE-NEXT: blendvps %xmm0, %xmm12, %xmm11 +; SSE-NEXT: movdqu (%rdi,%rcx,4), %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pslld %xmm9, %xmm13 +; SSE-NEXT: pslld %xmm10, %xmm12 ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm8 -; SSE-NEXT: movdqu 48(%rdi,%rcx,4), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pslld %xmm14, %xmm5 -; SSE-NEXT: pslld %xmm15, %xmm12 +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm12 +; SSE-NEXT: movdqu 48(%rdi,%rcx,4), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: pslld %xmm9, %xmm13 +; SSE-NEXT: pslld %xmm10, %xmm7 ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm12 +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm7 ; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pslld %xmm14, %xmm5 -; SSE-NEXT: pslld %xmm15, %xmm6 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm6 -; SSE-NEXT: movdqu 80(%rdi,%rcx,4), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pslld %xmm14, %xmm5 -; SSE-NEXT: pslld %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: pslld %xmm9, %xmm13 +; SSE-NEXT: pslld %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm6 +; SSE-NEXT: movdqu 80(%rdi,%rcx,4), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: pslld %xmm9, %xmm13 +; SSE-NEXT: pslld %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm5 +; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: pslld %xmm9, %xmm13 +; SSE-NEXT: pslld %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm4 +; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pslld %xmm9, %xmm13 +; SSE-NEXT: pslld %xmm10, %xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm1 -; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pslld %xmm14, %xmm2 -; SSE-NEXT: pslld %xmm15, %xmm5 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm5 -; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pslld %xmm14, %xmm4 -; SSE-NEXT: pslld %xmm15, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm2 -; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pslld %xmm14, %xmm7 -; SSE-NEXT: pslld %xmm15, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm7, %xmm4 -; SSE-NEXT: movups %xmm8, (%rdi,%rcx,4) -; SSE-NEXT: movups %xmm3, 16(%rdi,%rcx,4) +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm3 +; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pslld %xmm9, %xmm13 +; SSE-NEXT: pslld %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm2 +; SSE-NEXT: movups %xmm12, (%rdi,%rcx,4) +; SSE-NEXT: movups %xmm11, 16(%rdi,%rcx,4) ; SSE-NEXT: movups %xmm6, 32(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm12, 48(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm5, 64(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm1, 80(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm4, 96(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm2, 112(%rdi,%rcx,4) +; SSE-NEXT: movups 
%xmm7, 48(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm4, 64(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm5, 80(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm2, 96(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm3, 112(%rdi,%rcx,4) ; SSE-NEXT: addq $32, %rcx ; SSE-NEXT: cmpq %rcx, %rdx ; SSE-NEXT: jne .LBB0_4 ; SSE-NEXT: # %bb.5: # %middle.block -; SSE-NEXT: cmpq %rax, %rdx +; SSE-NEXT: cmpq %r9, %rdx ; SSE-NEXT: jne .LBB0_6 ; SSE-NEXT: .LBB0_9: # %for.cond.cleanup ; SSE-NEXT: retq @@ -125,12 +125,12 @@ ; SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; SSE-NEXT: shll %cl, (%rdi,%rdx,4) ; SSE-NEXT: incq %rdx -; SSE-NEXT: cmpq %rdx, %rax +; SSE-NEXT: cmpq %rdx, %r9 ; SSE-NEXT: je .LBB0_9 ; SSE-NEXT: .LBB0_6: # %for.body ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE-NEXT: cmpb $0, (%rsi,%rdx) -; SSE-NEXT: movl %r9d, %ecx +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: je .LBB0_8 ; SSE-NEXT: # %bb.7: # %for.body ; SSE-NEXT: # in Loop: Header=BB0_6 Depth=1 @@ -142,104 +142,102 @@ ; AVX1-NEXT: testl %edx, %edx ; AVX1-NEXT: jle .LBB0_9 ; AVX1-NEXT: # %bb.1: # %for.body.preheader -; AVX1-NEXT: movl %ecx, %r9d -; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: movl %edx, %r9d ; AVX1-NEXT: cmpl $31, %edx ; AVX1-NEXT: ja .LBB0_3 ; AVX1-NEXT: # %bb.2: ; AVX1-NEXT: xorl %edx, %edx ; AVX1-NEXT: jmp .LBB0_6 ; AVX1-NEXT: .LBB0_3: # %vector.ph -; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: movl %r9d, %edx ; AVX1-NEXT: andl $-32, %edx -; AVX1-NEXT: vmovd %r9d, %xmm0 +; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vmovd %r8d, %xmm1 ; AVX1-NEXT: xorl %ecx, %ecx ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB0_4: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm12, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm12, %xmm2 -; AVX1-NEXT: vpmovsxbd %xmm2, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm12, %xmm3 -; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-NEXT: # xmm7 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm8 -; AVX1-NEXT: vpslld %xmm7, %xmm8, %xmm9 -; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm10 # 16-byte Folded Reload -; AVX1-NEXT: # xmm10 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vpslld %xmm10, %xmm8, %xmm0 -; AVX1-NEXT: vblendvps %xmm5, %xmm9, %xmm0, %xmm8 -; AVX1-NEXT: vpmovsxbd %xmm3, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm4, %xmm12, %xmm4 -; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm0 -; AVX1-NEXT: vpslld %xmm7, %xmm0, %xmm7 -; AVX1-NEXT: vpslld %xmm10, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm9 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm12 -; AVX1-NEXT: vblendvps %xmm1, %xmm7, %xmm0, %xmm10 -; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm1 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpslld %xmm0, %xmm1, %xmm7 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpslld %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vblendvps %xmm6, %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu 48(%rdi,%rcx,4), %xmm6 -; AVX1-NEXT: vpslld %xmm0, %xmm6, %xmm7 -; AVX1-NEXT: vpslld %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vblendvps %xmm2, %xmm7, %xmm6, %xmm2 -; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm6 -; AVX1-NEXT: vpslld %xmm13, %xmm6, %xmm7 -; AVX1-NEXT: vpslld %xmm14, %xmm6, %xmm6 -; AVX1-NEXT: vblendvps %xmm5, %xmm7, %xmm6, %xmm5 -; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm6 -; AVX1-NEXT: vpslld %xmm13, %xmm6, %xmm7 -; AVX1-NEXT: vpslld %xmm14, %xmm6, %xmm6 -; AVX1-NEXT: vblendvps %xmm3, %xmm7, %xmm6, %xmm3 -; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm6 -; AVX1-NEXT: vpslld %xmm15, %xmm6, %xmm7 -; AVX1-NEXT: vpslld %xmm11, %xmm6, %xmm6 -; AVX1-NEXT: vblendvps %xmm9, %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqu 112(%rdi,%rcx,4), %xmm7 -; AVX1-NEXT: vpslld %xmm15, %xmm7, %xmm0 -; AVX1-NEXT: vpslld %xmm11, %xmm7, %xmm7 -; AVX1-NEXT: vblendvps %xmm12, %xmm0, %xmm7, %xmm0 -; AVX1-NEXT: vmovups %xmm8, (%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm10, 16(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm1, 32(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm2, 48(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm5, 64(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm3, 80(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm6, 96(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm0, 112(%rdi,%rcx,4) +; AVX1-NEXT: vmovq {{.*#+}} xmm9 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm10 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm11 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm12 = mem[0],zero +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm9, %xmm9 +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm9 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm10, %xmm10 +; AVX1-NEXT: vpmovsxbd %xmm10, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm10, %xmm10 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm11, %xmm11 +; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-NEXT: # xmm15 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm0 +; AVX1-NEXT: vpslld %xmm15, %xmm0, %xmm1 +; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: # xmm2 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm13, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbd %xmm11, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm11, %xmm11 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm12, %xmm12 +; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm13 +; AVX1-NEXT: vpslld %xmm15, %xmm13, 
%xmm15 +; AVX1-NEXT: vpslld %xmm2, %xmm13, %xmm2 +; AVX1-NEXT: vpmovsxbd %xmm12, %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm12, %xmm12 +; AVX1-NEXT: vblendvps %xmm9, %xmm15, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm9 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpslld %xmm3, %xmm9, %xmm15 +; AVX1-NEXT: vpslld %xmm4, %xmm9, %xmm9 +; AVX1-NEXT: vblendvps %xmm14, %xmm15, %xmm9, %xmm9 +; AVX1-NEXT: vmovdqu 48(%rdi,%rcx,4), %xmm14 +; AVX1-NEXT: vpslld %xmm3, %xmm14, %xmm15 +; AVX1-NEXT: vpslld %xmm4, %xmm14, %xmm14 +; AVX1-NEXT: vblendvps %xmm10, %xmm15, %xmm14, %xmm10 +; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm14 +; AVX1-NEXT: vpslld %xmm5, %xmm14, %xmm15 +; AVX1-NEXT: vpslld %xmm6, %xmm14, %xmm14 +; AVX1-NEXT: vblendvps %xmm1, %xmm15, %xmm14, %xmm1 +; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm14 +; AVX1-NEXT: vpslld %xmm5, %xmm14, %xmm15 +; AVX1-NEXT: vpslld %xmm6, %xmm14, %xmm14 +; AVX1-NEXT: vblendvps %xmm11, %xmm15, %xmm14, %xmm11 +; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm14 +; AVX1-NEXT: vpslld %xmm7, %xmm14, %xmm15 +; AVX1-NEXT: vpslld %xmm8, %xmm14, %xmm14 +; AVX1-NEXT: vblendvps %xmm13, %xmm15, %xmm14, %xmm13 +; AVX1-NEXT: vmovdqu 112(%rdi,%rcx,4), %xmm14 +; AVX1-NEXT: vpslld %xmm7, %xmm14, %xmm15 +; AVX1-NEXT: vpslld %xmm8, %xmm14, %xmm14 +; AVX1-NEXT: vblendvps %xmm12, %xmm15, %xmm14, %xmm12 +; AVX1-NEXT: vmovups %xmm0, (%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm2, 16(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm9, 32(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm10, 48(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm1, 64(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm11, 80(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm13, 96(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm12, 112(%rdi,%rcx,4) ; AVX1-NEXT: addq $32, %rcx ; AVX1-NEXT: cmpq %rcx, %rdx ; AVX1-NEXT: jne .LBB0_4 ; AVX1-NEXT: # %bb.5: # %middle.block -; AVX1-NEXT: cmpq %rax, %rdx +; AVX1-NEXT: cmpq %r9, %rdx ; AVX1-NEXT: jne .LBB0_6 ; AVX1-NEXT: .LBB0_9: # %for.cond.cleanup ; AVX1-NEXT: vzeroupper @@ -250,12 +248,12 @@ ; AVX1-NEXT: # kill: def $cl killed $cl killed $ecx ; AVX1-NEXT: shll %cl, (%rdi,%rdx,4) ; AVX1-NEXT: incq %rdx -; AVX1-NEXT: cmpq %rdx, %rax +; AVX1-NEXT: cmpq %rdx, %r9 ; AVX1-NEXT: je .LBB0_9 ; AVX1-NEXT: .LBB0_6: # %for.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: cmpb $0, (%rsi,%rdx) -; AVX1-NEXT: movl %r9d, %ecx +; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: je .LBB0_8 ; AVX1-NEXT: # %bb.7: # %for.body ; AVX1-NEXT: # in Loop: Header=BB0_6 Depth=1 @@ -267,17 +265,17 @@ ; AVX2-NEXT: testl %edx, %edx ; AVX2-NEXT: jle .LBB0_9 ; AVX2-NEXT: # %bb.1: # %for.body.preheader -; AVX2-NEXT: movl %ecx, %r9d -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movl %edx, %r9d ; AVX2-NEXT: cmpl $31, %edx ; AVX2-NEXT: ja .LBB0_3 ; AVX2-NEXT: # %bb.2: ; AVX2-NEXT: xorl %edx, %edx ; AVX2-NEXT: jmp .LBB0_6 ; AVX2-NEXT: .LBB0_3: # %vector.ph -; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: movl %r9d, %edx ; AVX2-NEXT: andl $-32, %edx -; AVX2-NEXT: vmovd %r9d, %xmm0 +; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX2-NEXT: vmovd %r8d, %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 @@ -314,7 +312,7 @@ ; AVX2-NEXT: cmpq %rcx, %rdx ; AVX2-NEXT: jne .LBB0_4 ; AVX2-NEXT: # %bb.5: # %middle.block -; AVX2-NEXT: cmpq %rax, %rdx +; AVX2-NEXT: cmpq %r9, %rdx ; AVX2-NEXT: jne .LBB0_6 ; AVX2-NEXT: .LBB0_9: # %for.cond.cleanup ; AVX2-NEXT: vzeroupper @@ -325,12 +323,12 @@ ; AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; 
AVX2-NEXT: shll %cl, (%rdi,%rdx,4) ; AVX2-NEXT: incq %rdx -; AVX2-NEXT: cmpq %rdx, %rax +; AVX2-NEXT: cmpq %rdx, %r9 ; AVX2-NEXT: je .LBB0_9 ; AVX2-NEXT: .LBB0_6: # %for.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX2-NEXT: cmpb $0, (%rsi,%rdx) -; AVX2-NEXT: movl %r9d, %ecx +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: je .LBB0_8 ; AVX2-NEXT: # %bb.7: # %for.body ; AVX2-NEXT: # in Loop: Header=BB0_6 Depth=1 @@ -342,78 +340,78 @@ ; XOP-NEXT: testl %edx, %edx ; XOP-NEXT: jle .LBB0_9 ; XOP-NEXT: # %bb.1: # %for.body.preheader -; XOP-NEXT: movl %ecx, %r9d -; XOP-NEXT: movl %edx, %eax +; XOP-NEXT: movl %ecx, %eax +; XOP-NEXT: movl %edx, %r9d ; XOP-NEXT: cmpl $31, %edx ; XOP-NEXT: ja .LBB0_3 ; XOP-NEXT: # %bb.2: ; XOP-NEXT: xorl %edx, %edx ; XOP-NEXT: jmp .LBB0_6 ; XOP-NEXT: .LBB0_3: # %vector.ph -; XOP-NEXT: movl %eax, %edx +; XOP-NEXT: movl %r9d, %edx ; XOP-NEXT: andl $-32, %edx -; XOP-NEXT: vmovd %r9d, %xmm0 +; XOP-NEXT: vmovd %eax, %xmm0 ; XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm9 +; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOP-NEXT: vmovd %r8d, %xmm1 ; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm14 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; XOP-NEXT: xorl %ecx, %ecx -; XOP-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; XOP-NEXT: vextractf128 $1, %ymm9, %xmm15 -; XOP-NEXT: vextractf128 $1, %ymm14, %xmm4 +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm4 ; XOP-NEXT: .p2align 4, 0x90 ; XOP-NEXT: .LBB0_4: # %vector.body ; XOP-NEXT: # =>This Inner Loop Header: Depth=1 ; XOP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero ; XOP-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero ; XOP-NEXT: vmovq {{.*#+}} xmm7 = mem[0],zero -; XOP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; XOP-NEXT: vpcomeqb %xmm8, %xmm5, %xmm5 -; XOP-NEXT: vpmovsxbd %xmm5, %xmm0 +; XOP-NEXT: vmovq {{.*#+}} xmm8 = mem[0],zero +; XOP-NEXT: vpcomeqb %xmm2, %xmm5, %xmm5 +; XOP-NEXT: vpmovsxbd %xmm5, %xmm9 ; XOP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] ; XOP-NEXT: vpmovsxbd %xmm5, %xmm5 -; XOP-NEXT: vpcomeqb %xmm8, %xmm6, %xmm6 +; XOP-NEXT: vpcomeqb %xmm2, %xmm6, %xmm6 ; XOP-NEXT: vpmovsxbd %xmm6, %xmm10 ; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] ; XOP-NEXT: vpmovsxbd %xmm6, %xmm6 -; XOP-NEXT: vpcomeqb %xmm8, %xmm7, %xmm7 +; XOP-NEXT: vpcomeqb %xmm2, %xmm7, %xmm7 ; XOP-NEXT: vpmovsxbd %xmm7, %xmm11 ; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] ; XOP-NEXT: vpmovsxbd %xmm7, %xmm7 -; XOP-NEXT: vpcomeqb %xmm8, %xmm2, %xmm2 -; XOP-NEXT: vpmovsxbd %xmm2, %xmm12 -; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; XOP-NEXT: vpmovsxbd %xmm2, %xmm2 -; XOP-NEXT: vblendvps %xmm5, %xmm15, %xmm4, %xmm5 -; XOP-NEXT: vpshld %xmm5, 16(%rdi,%rcx,4), %xmm13 -; XOP-NEXT: vblendvps %xmm0, %xmm9, %xmm14, %xmm0 -; XOP-NEXT: vpshld %xmm0, (%rdi,%rcx,4), %xmm0 -; XOP-NEXT: vblendvps %xmm6, %xmm15, %xmm4, %xmm6 +; XOP-NEXT: vpcomeqb %xmm2, %xmm8, %xmm8 +; XOP-NEXT: vpmovsxbd %xmm8, %xmm12 +; XOP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,1,1] +; XOP-NEXT: vpmovsxbd %xmm8, %xmm8 +; XOP-NEXT: vblendvps %xmm5, %xmm3, %xmm4, %xmm5 +; XOP-NEXT: vpshld %xmm5, 16(%rdi,%rcx,4), %xmm5 +; XOP-NEXT: vblendvps %xmm9, %xmm0, %xmm1, %xmm9 +; XOP-NEXT: vpshld %xmm9, (%rdi,%rcx,4), %xmm9 +; XOP-NEXT: vblendvps %xmm6, %xmm3, %xmm4, %xmm6 ; XOP-NEXT: vpshld %xmm6, 48(%rdi,%rcx,4), %xmm6 -; XOP-NEXT: vblendvps %xmm10, %xmm9, %xmm14, %xmm5 -; XOP-NEXT: vpshld %xmm5, 32(%rdi,%rcx,4), %xmm5 -; 
XOP-NEXT: vblendvps %xmm7, %xmm15, %xmm4, %xmm7 +; XOP-NEXT: vblendvps %xmm10, %xmm0, %xmm1, %xmm10 +; XOP-NEXT: vpshld %xmm10, 32(%rdi,%rcx,4), %xmm10 +; XOP-NEXT: vblendvps %xmm7, %xmm3, %xmm4, %xmm7 ; XOP-NEXT: vpshld %xmm7, 80(%rdi,%rcx,4), %xmm7 -; XOP-NEXT: vblendvps %xmm11, %xmm9, %xmm14, %xmm1 -; XOP-NEXT: vpshld %xmm1, 64(%rdi,%rcx,4), %xmm1 -; XOP-NEXT: vblendvps %xmm2, %xmm15, %xmm4, %xmm2 -; XOP-NEXT: vpshld %xmm2, 112(%rdi,%rcx,4), %xmm2 -; XOP-NEXT: vblendvps %xmm12, %xmm9, %xmm14, %xmm3 -; XOP-NEXT: vpshld %xmm3, 96(%rdi,%rcx,4), %xmm3 -; XOP-NEXT: vmovdqu %xmm0, (%rdi,%rcx,4) -; XOP-NEXT: vmovdqu %xmm13, 16(%rdi,%rcx,4) -; XOP-NEXT: vmovdqu %xmm5, 32(%rdi,%rcx,4) +; XOP-NEXT: vblendvps %xmm11, %xmm0, %xmm1, %xmm11 +; XOP-NEXT: vpshld %xmm11, 64(%rdi,%rcx,4), %xmm11 +; XOP-NEXT: vblendvps %xmm8, %xmm3, %xmm4, %xmm8 +; XOP-NEXT: vpshld %xmm8, 112(%rdi,%rcx,4), %xmm8 +; XOP-NEXT: vblendvps %xmm12, %xmm0, %xmm1, %xmm12 +; XOP-NEXT: vpshld %xmm12, 96(%rdi,%rcx,4), %xmm12 +; XOP-NEXT: vmovdqu %xmm9, (%rdi,%rcx,4) +; XOP-NEXT: vmovdqu %xmm5, 16(%rdi,%rcx,4) +; XOP-NEXT: vmovdqu %xmm10, 32(%rdi,%rcx,4) ; XOP-NEXT: vmovdqu %xmm6, 48(%rdi,%rcx,4) -; XOP-NEXT: vmovdqu %xmm1, 64(%rdi,%rcx,4) +; XOP-NEXT: vmovdqu %xmm11, 64(%rdi,%rcx,4) ; XOP-NEXT: vmovdqu %xmm7, 80(%rdi,%rcx,4) -; XOP-NEXT: vmovdqu %xmm3, 96(%rdi,%rcx,4) -; XOP-NEXT: vmovdqu %xmm2, 112(%rdi,%rcx,4) +; XOP-NEXT: vmovdqu %xmm12, 96(%rdi,%rcx,4) +; XOP-NEXT: vmovdqu %xmm8, 112(%rdi,%rcx,4) ; XOP-NEXT: addq $32, %rcx ; XOP-NEXT: cmpq %rcx, %rdx ; XOP-NEXT: jne .LBB0_4 ; XOP-NEXT: # %bb.5: # %middle.block -; XOP-NEXT: cmpq %rax, %rdx +; XOP-NEXT: cmpq %r9, %rdx ; XOP-NEXT: jne .LBB0_6 ; XOP-NEXT: .LBB0_9: # %for.cond.cleanup ; XOP-NEXT: vzeroupper @@ -424,12 +422,12 @@ ; XOP-NEXT: # kill: def $cl killed $cl killed $ecx ; XOP-NEXT: shll %cl, (%rdi,%rdx,4) ; XOP-NEXT: incq %rdx -; XOP-NEXT: cmpq %rdx, %rax +; XOP-NEXT: cmpq %rdx, %r9 ; XOP-NEXT: je .LBB0_9 ; XOP-NEXT: .LBB0_6: # %for.body ; XOP-NEXT: # =>This Inner Loop Header: Depth=1 ; XOP-NEXT: cmpb $0, (%rsi,%rdx) -; XOP-NEXT: movl %r9d, %ecx +; XOP-NEXT: movl %eax, %ecx ; XOP-NEXT: je .LBB0_8 ; XOP-NEXT: # %bb.7: # %for.body ; XOP-NEXT: # in Loop: Header=BB0_6 Depth=1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll @@ -11,10 +11,10 @@ ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -23,24 +23,24 @@ ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = 
+; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] @@ -153,39 +153,39 @@ ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX2-NEXT: vpshufb %xmm5, 
%xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: f2: @@ -293,19 +293,19 @@ ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -314,16 +314,16 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa 176(%rdi), %xmm5 -; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa 176(%rdi), %xmm3 +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] @@ -441,39 +441,39 @@ ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = 
[255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: f4: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -244,10 +244,10 @@ ; SSE2-NEXT: # kill: def $edx killed $edx def $rdx ; SSE2-NEXT: # kill: def $esi killed $esi def $rsi ; SSE2-NEXT: # kill: def $edi killed $edi def $rdi -; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; SSE2-NEXT: andl $7, %r10d ; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $7, %eax +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: andl $7, %r10d ; SSE2-NEXT: andl $7, %edi ; SSE2-NEXT: andl $7, %esi ; SSE2-NEXT: andl $7, %edx @@ -271,9 +271,9 @@ ; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %ecx ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %eax ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -288,10 +288,10 @@ ; SSSE3-NEXT: # kill: def $edx killed $edx def $rdx ; SSSE3-NEXT: # kill: def $esi killed $esi def $rsi ; SSSE3-NEXT: # kill: def $edi killed $edi def $rdi -; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; SSSE3-NEXT: andl $7, %r10d ; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $7, %eax +; SSSE3-NEXT: 
movzwl {{[0-9]+}}(%rsp), %r10d +; SSSE3-NEXT: andl $7, %r10d ; SSSE3-NEXT: andl $7, %edi ; SSSE3-NEXT: andl $7, %esi ; SSSE3-NEXT: andl $7, %edx @@ -315,9 +315,9 @@ ; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %ecx ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %eax ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -332,10 +332,10 @@ ; SSE41-NEXT: # kill: def $edx killed $edx def $rdx ; SSE41-NEXT: # kill: def $esi killed $esi def $rsi ; SSE41-NEXT: # kill: def $edi killed $edi def $rdi -; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; SSE41-NEXT: andl $7, %r10d ; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; SSE41-NEXT: andl $7, %eax +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; SSE41-NEXT: andl $7, %r10d ; SSE41-NEXT: andl $7, %edi ; SSE41-NEXT: andl $7, %esi ; SSE41-NEXT: andl $7, %edx @@ -350,8 +350,8 @@ ; SSE41-NEXT: pinsrw $3, -24(%rsp,%rcx,2), %xmm0 ; SSE41-NEXT: pinsrw $4, -24(%rsp,%r8,2), %xmm0 ; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm0 -; SSE41-NEXT: pinsrw $6, -24(%rsp,%rax,2), %xmm0 -; SSE41-NEXT: pinsrw $7, -24(%rsp,%r10,2), %xmm0 +; SSE41-NEXT: pinsrw $6, -24(%rsp,%r10,2), %xmm0 +; SSE41-NEXT: pinsrw $7, -24(%rsp,%rax,2), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: @@ -362,10 +362,10 @@ ; AVX-NEXT: # kill: def $edx killed $edx def $rdx ; AVX-NEXT: # kill: def $esi killed $esi def $rsi ; AVX-NEXT: # kill: def $edi killed $edi def $rdi -; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; AVX-NEXT: andl $7, %r10d ; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; AVX-NEXT: andl $7, %eax +; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; AVX-NEXT: andl $7, %r10d ; AVX-NEXT: andl $7, %edi ; AVX-NEXT: andl $7, %esi ; AVX-NEXT: andl $7, %edx @@ -380,8 +380,8 @@ ; AVX-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 ; AVX-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0 ; AVX-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $7, -24(%rsp,%r10,2), %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $6, -24(%rsp,%r10,2), %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX-NEXT: retq %x0 = extractelement <8 x i16> %x, i16 %i0 %x1 = extractelement <8 x i16> %x, i16 %i1 @@ -415,15 +415,15 @@ ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm15 +; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: movd %eax, %xmm4 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax @@ -431,7 +431,7 @@ ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm10 +; SSE2-NEXT: movd %eax, %xmm5 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: 
andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax @@ -439,51 +439,51 @@ ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm11 +; SSE2-NEXT: movd %eax, %xmm8 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE2-NEXT: movd %eax, %xmm6 ; SSE2-NEXT: andl $15, %ecx ; SSE2-NEXT: movzbl -24(%rsp,%rcx), %eax -; SSE2-NEXT: movd %eax, %xmm12 +; SSE2-NEXT: movd %eax, %xmm9 ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl -24(%rsp,%rdx), %eax -; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: movd %eax, %xmm10 ; SSE2-NEXT: andl $15, %esi ; SSE2-NEXT: movzbl -24(%rsp,%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm13 +; SSE2-NEXT: movd %eax, %xmm11 ; SSE2-NEXT: andl $15, %edi ; SSE2-NEXT: movzbl -24(%rsp,%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: andl $15, %r9d ; SSE2-NEXT: movzbl -24(%rsp,%r9), %eax -; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: movd %eax, %xmm12 ; SSE2-NEXT: andl $15, %r8d ; SSE2-NEXT: movzbl -24(%rsp,%r8), %eax -; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movd %eax, %xmm13 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movd %eax, %xmm14 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: movd %eax, %xmm15 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE2-NEXT: retq ; @@ -499,15 +499,15 @@ ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm8 +; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm15 +; SSSE3-NEXT: movd %eax, %xmm2 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm9 +; SSSE3-NEXT: movd %eax, %xmm4 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -515,7 +515,7 @@ ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm10 +; SSSE3-NEXT: movd %eax, %xmm5 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -523,51 +523,51 @@ ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: movd %eax, %xmm8 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm6 ; SSSE3-NEXT: andl $15, %ecx ; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movd %eax, %xmm9 ; SSSE3-NEXT: andl $15, %edx ; SSSE3-NEXT: movzbl -24(%rsp,%rdx), %eax -; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm10 ; SSSE3-NEXT: andl $15, %esi ; SSSE3-NEXT: movzbl -24(%rsp,%rsi), %eax -; SSSE3-NEXT: 
movd %eax, %xmm13 +; SSSE3-NEXT: movd %eax, %xmm11 ; SSSE3-NEXT: andl $15, %edi ; SSSE3-NEXT: movzbl -24(%rsp,%rdi), %eax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: andl $15, %r9d ; SSSE3-NEXT: movzbl -24(%rsp,%r9), %eax -; SSSE3-NEXT: movd %eax, %xmm14 +; SSSE3-NEXT: movd %eax, %xmm12 ; SSSE3-NEXT: andl $15, %r8d ; SSSE3-NEXT: movzbl -24(%rsp,%r8), %eax -; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movd %eax, %xmm13 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movd %eax, %xmm14 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSSE3-NEXT: movd %eax, %xmm15 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: 
punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSSE3-NEXT: retq ; @@ -820,85 +820,85 @@ ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: movzbl (%rdi), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl 1(%rdi), %r9d -; SSE2-NEXT: movzbl 2(%rdi), %r10d -; SSE2-NEXT: movzbl 3(%rdi), %r11d -; SSE2-NEXT: movzbl 4(%rdi), %r14d -; SSE2-NEXT: movzbl 5(%rdi), %r15d -; SSE2-NEXT: movzbl 6(%rdi), %r12d -; SSE2-NEXT: movzbl 7(%rdi), %r13d +; SSE2-NEXT: movzbl 1(%rdi), %ecx +; SSE2-NEXT: movzbl 2(%rdi), %edx +; SSE2-NEXT: movzbl 3(%rdi), %esi +; SSE2-NEXT: movzbl 4(%rdi), %r8d +; SSE2-NEXT: movzbl 5(%rdi), %r9d +; SSE2-NEXT: movzbl 6(%rdi), %r10d +; SSE2-NEXT: movzbl 7(%rdi), %r11d ; SSE2-NEXT: movzbl 8(%rdi), %ebx -; SSE2-NEXT: movzbl 9(%rdi), %r8d -; SSE2-NEXT: movzbl 10(%rdi), %ecx -; SSE2-NEXT: movzbl 11(%rdi), %edx -; SSE2-NEXT: movzbl 12(%rdi), %esi +; SSE2-NEXT: movzbl 9(%rdi), %r14d +; SSE2-NEXT: movzbl 10(%rdi), %r15d +; SSE2-NEXT: movzbl 11(%rdi), %r12d +; SSE2-NEXT: movzbl 12(%rdi), %r13d ; SSE2-NEXT: movzbl 13(%rdi), %ebp ; SSE2-NEXT: movzbl 14(%rdi), %eax ; SSE2-NEXT: movzbl 15(%rdi), %edi ; SSE2-NEXT: andl $15, %edi ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -24(%rsp,%rdi), %edi -; SSE2-NEXT: movd %edi, %xmm8 +; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm15 +; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: andl $15, %ebp ; SSE2-NEXT: movzbl -24(%rsp,%rbp), %eax -; SSE2-NEXT: movd %eax, %xmm9 -; SSE2-NEXT: andl $15, %esi -; SSE2-NEXT: movzbl -24(%rsp,%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl -24(%rsp,%rdx), %eax -; SSE2-NEXT: movd %eax, %xmm10 -; SSE2-NEXT: andl $15, %ecx -; SSE2-NEXT: movzbl -24(%rsp,%rcx), %eax -; SSE2-NEXT: movd %eax, %xmm7 -; SSE2-NEXT: andl $15, %r8d -; SSE2-NEXT: movzbl -24(%rsp,%r8), %eax -; SSE2-NEXT: movd %eax, %xmm11 -; SSE2-NEXT: andl $15, %ebx -; SSE2-NEXT: movzbl -24(%rsp,%rbx), %eax -; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movd %eax, %xmm4 ; SSE2-NEXT: andl $15, %r13d ; SSE2-NEXT: movzbl -24(%rsp,%r13), %eax -; SSE2-NEXT: movd %eax, %xmm12 +; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: andl $15, %r12d ; SSE2-NEXT: movzbl -24(%rsp,%r12), %eax ; SSE2-NEXT: movd %eax, %xmm5 ; SSE2-NEXT: andl $15, %r15d ; SSE2-NEXT: movzbl -24(%rsp,%r15), %eax -; SSE2-NEXT: movd %eax, %xmm13 +; SSE2-NEXT: movd %eax, %xmm7 ; SSE2-NEXT: andl $15, %r14d ; SSE2-NEXT: movzbl -24(%rsp,%r14), %eax -; 
SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: andl $15, %ebx +; SSE2-NEXT: movzbl -24(%rsp,%rbx), %eax +; SSE2-NEXT: movd %eax, %xmm6 ; SSE2-NEXT: andl $15, %r11d ; SSE2-NEXT: movzbl -24(%rsp,%r11), %eax -; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: movd %eax, %xmm9 ; SSE2-NEXT: andl $15, %r10d ; SSE2-NEXT: movzbl -24(%rsp,%r10), %eax -; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movd %eax, %xmm10 ; SSE2-NEXT: andl $15, %r9d ; SSE2-NEXT: movzbl -24(%rsp,%r9), %eax -; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movd %eax, %xmm11 +; SSE2-NEXT: andl $15, %r8d +; SSE2-NEXT: movzbl -24(%rsp,%r8), %eax +; SSE2-NEXT: movd %eax, %xmm12 +; SSE2-NEXT: andl $15, %esi +; SSE2-NEXT: movzbl -24(%rsp,%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm13 +; SSE2-NEXT: andl $15, %edx +; SSE2-NEXT: movzbl -24(%rsp,%rdx), %eax +; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %eax +; SSE2-NEXT: movd %eax, %xmm15 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE2-NEXT: punpcklbw {{.*#+}} 
xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 @@ -918,85 +918,85 @@ ; SSSE3-NEXT: pushq %rbx ; SSSE3-NEXT: movzbl (%rdi), %eax ; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSSE3-NEXT: movzbl 1(%rdi), %r9d -; SSSE3-NEXT: movzbl 2(%rdi), %r10d -; SSSE3-NEXT: movzbl 3(%rdi), %r11d -; SSSE3-NEXT: movzbl 4(%rdi), %r14d -; SSSE3-NEXT: movzbl 5(%rdi), %r15d -; SSSE3-NEXT: movzbl 6(%rdi), %r12d -; SSSE3-NEXT: movzbl 7(%rdi), %r13d +; SSSE3-NEXT: movzbl 1(%rdi), %ecx +; SSSE3-NEXT: movzbl 2(%rdi), %edx +; SSSE3-NEXT: movzbl 3(%rdi), %esi +; SSSE3-NEXT: movzbl 4(%rdi), %r8d +; SSSE3-NEXT: movzbl 5(%rdi), %r9d +; SSSE3-NEXT: movzbl 6(%rdi), %r10d +; SSSE3-NEXT: movzbl 7(%rdi), %r11d ; SSSE3-NEXT: movzbl 8(%rdi), %ebx -; SSSE3-NEXT: movzbl 9(%rdi), %r8d -; SSSE3-NEXT: movzbl 10(%rdi), %ecx -; SSSE3-NEXT: movzbl 11(%rdi), %edx -; SSSE3-NEXT: movzbl 12(%rdi), %esi +; SSSE3-NEXT: movzbl 9(%rdi), %r14d +; SSSE3-NEXT: movzbl 10(%rdi), %r15d +; SSSE3-NEXT: movzbl 11(%rdi), %r12d +; SSSE3-NEXT: movzbl 12(%rdi), %r13d ; SSSE3-NEXT: movzbl 13(%rdi), %ebp ; SSSE3-NEXT: movzbl 14(%rdi), %eax ; SSSE3-NEXT: movzbl 15(%rdi), %edi ; SSSE3-NEXT: andl $15, %edi ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl -24(%rsp,%rdi), %edi -; SSSE3-NEXT: movd %edi, %xmm8 +; SSSE3-NEXT: movd %edi, %xmm1 ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm15 +; SSSE3-NEXT: movd %eax, %xmm2 ; SSSE3-NEXT: andl $15, %ebp ; SSSE3-NEXT: movzbl -24(%rsp,%rbp), %eax -; SSSE3-NEXT: movd %eax, %xmm9 -; SSSE3-NEXT: andl $15, %esi -; SSSE3-NEXT: movzbl -24(%rsp,%rsi), %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl -24(%rsp,%rdx), %eax -; SSSE3-NEXT: movd %eax, %xmm10 -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm7 -; SSSE3-NEXT: andl $15, %r8d -; SSSE3-NEXT: movzbl -24(%rsp,%r8), %eax -; SSSE3-NEXT: movd %eax, %xmm11 -; SSSE3-NEXT: andl $15, %ebx -; SSSE3-NEXT: movzbl -24(%rsp,%rbx), %eax -; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movd %eax, %xmm4 ; SSSE3-NEXT: andl $15, %r13d ; SSSE3-NEXT: 
movzbl -24(%rsp,%r13), %eax -; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: andl $15, %r12d ; SSSE3-NEXT: movzbl -24(%rsp,%r12), %eax ; SSSE3-NEXT: movd %eax, %xmm5 ; SSSE3-NEXT: andl $15, %r15d ; SSSE3-NEXT: movzbl -24(%rsp,%r15), %eax -; SSSE3-NEXT: movd %eax, %xmm13 +; SSSE3-NEXT: movd %eax, %xmm7 ; SSSE3-NEXT: andl $15, %r14d ; SSSE3-NEXT: movzbl -24(%rsp,%r14), %eax -; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movd %eax, %xmm8 +; SSSE3-NEXT: andl $15, %ebx +; SSSE3-NEXT: movzbl -24(%rsp,%rbx), %eax +; SSSE3-NEXT: movd %eax, %xmm6 ; SSSE3-NEXT: andl $15, %r11d ; SSSE3-NEXT: movzbl -24(%rsp,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm14 +; SSSE3-NEXT: movd %eax, %xmm9 ; SSSE3-NEXT: andl $15, %r10d ; SSSE3-NEXT: movzbl -24(%rsp,%r10), %eax -; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movd %eax, %xmm10 ; SSSE3-NEXT: andl $15, %r9d ; SSSE3-NEXT: movzbl -24(%rsp,%r9), %eax -; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: andl $15, %r8d +; SSSE3-NEXT: movzbl -24(%rsp,%r8), %eax +; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: andl $15, %esi +; SSSE3-NEXT: movzbl -24(%rsp,%rsi), %eax +; SSSE3-NEXT: movd %eax, %xmm13 +; SSSE3-NEXT: andl $15, %edx +; SSSE3-NEXT: movzbl -24(%rsp,%rdx), %eax +; SSSE3-NEXT: movd %eax, %xmm14 +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %eax +; SSSE3-NEXT: movd %eax, %xmm15 ; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} 
xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 @@ -1014,52 +1014,52 @@ ; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movzbl (%rdi), %r9d -; SSE41-NEXT: andl $15, %r9d -; SSE41-NEXT: movzbl 1(%rdi), %ebx -; SSE41-NEXT: movzbl 2(%rdi), %eax +; SSE41-NEXT: movzbl (%rdi), %ecx +; SSE41-NEXT: andl $15, %ecx +; SSE41-NEXT: movzbl 1(%rdi), %eax ; SSE41-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE41-NEXT: movzbl 3(%rdi), %r11d -; SSE41-NEXT: movzbl 4(%rdi), %r14d -; SSE41-NEXT: movzbl 5(%rdi), %r15d -; SSE41-NEXT: movzbl 6(%rdi), %r12d -; SSE41-NEXT: movzbl 7(%rdi), %r13d -; SSE41-NEXT: movzbl 8(%rdi), %r10d -; SSE41-NEXT: movzbl 9(%rdi), %r8d -; SSE41-NEXT: movzbl 10(%rdi), %ecx -; SSE41-NEXT: movzbl 11(%rdi), %edx -; SSE41-NEXT: movzbl 12(%rdi), %esi +; SSE41-NEXT: movzbl 2(%rdi), %edx +; SSE41-NEXT: movzbl 3(%rdi), %esi +; SSE41-NEXT: movzbl 4(%rdi), %r8d +; SSE41-NEXT: movzbl 5(%rdi), %r9d +; SSE41-NEXT: movzbl 6(%rdi), %r10d +; SSE41-NEXT: movzbl 7(%rdi), %r11d +; SSE41-NEXT: movzbl 8(%rdi), %ebx +; SSE41-NEXT: movzbl 9(%rdi), %r14d +; SSE41-NEXT: movzbl 10(%rdi), %r15d +; SSE41-NEXT: movzbl 11(%rdi), %r12d +; SSE41-NEXT: movzbl 12(%rdi), %r13d ; SSE41-NEXT: movzbl 13(%rdi), %ebp ; SSE41-NEXT: movzbl 14(%rdi), %eax ; SSE41-NEXT: movzbl 15(%rdi), %edi ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movzbl -24(%rsp,%r9), %r9d -; SSE41-NEXT: movd %r9d, %xmm0 -; SSE41-NEXT: andl $15, %ebx -; SSE41-NEXT: pinsrb $1, -24(%rsp,%rbx), %xmm0 -; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; SSE41-NEXT: andl $15, %ebx -; SSE41-NEXT: pinsrb $2, -24(%rsp,%rbx), %xmm0 +; SSE41-NEXT: movzbl 
-24(%rsp,%rcx), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE41-NEXT: andl $15, %ecx +; SSE41-NEXT: pinsrb $1, -24(%rsp,%rcx), %xmm0 +; SSE41-NEXT: andl $15, %edx +; SSE41-NEXT: pinsrb $2, -24(%rsp,%rdx), %xmm0 +; SSE41-NEXT: andl $15, %esi +; SSE41-NEXT: pinsrb $3, -24(%rsp,%rsi), %xmm0 +; SSE41-NEXT: andl $15, %r8d +; SSE41-NEXT: pinsrb $4, -24(%rsp,%r8), %xmm0 +; SSE41-NEXT: andl $15, %r9d +; SSE41-NEXT: pinsrb $5, -24(%rsp,%r9), %xmm0 +; SSE41-NEXT: andl $15, %r10d +; SSE41-NEXT: pinsrb $6, -24(%rsp,%r10), %xmm0 ; SSE41-NEXT: andl $15, %r11d -; SSE41-NEXT: pinsrb $3, -24(%rsp,%r11), %xmm0 +; SSE41-NEXT: pinsrb $7, -24(%rsp,%r11), %xmm0 +; SSE41-NEXT: andl $15, %ebx +; SSE41-NEXT: pinsrb $8, -24(%rsp,%rbx), %xmm0 ; SSE41-NEXT: andl $15, %r14d -; SSE41-NEXT: pinsrb $4, -24(%rsp,%r14), %xmm0 +; SSE41-NEXT: pinsrb $9, -24(%rsp,%r14), %xmm0 ; SSE41-NEXT: andl $15, %r15d -; SSE41-NEXT: pinsrb $5, -24(%rsp,%r15), %xmm0 +; SSE41-NEXT: pinsrb $10, -24(%rsp,%r15), %xmm0 ; SSE41-NEXT: andl $15, %r12d -; SSE41-NEXT: pinsrb $6, -24(%rsp,%r12), %xmm0 +; SSE41-NEXT: pinsrb $11, -24(%rsp,%r12), %xmm0 ; SSE41-NEXT: andl $15, %r13d -; SSE41-NEXT: pinsrb $7, -24(%rsp,%r13), %xmm0 -; SSE41-NEXT: andl $15, %r10d -; SSE41-NEXT: pinsrb $8, -24(%rsp,%r10), %xmm0 -; SSE41-NEXT: andl $15, %r8d -; SSE41-NEXT: pinsrb $9, -24(%rsp,%r8), %xmm0 -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $10, -24(%rsp,%rcx), %xmm0 -; SSE41-NEXT: andl $15, %edx -; SSE41-NEXT: pinsrb $11, -24(%rsp,%rdx), %xmm0 -; SSE41-NEXT: andl $15, %esi -; SSE41-NEXT: pinsrb $12, -24(%rsp,%rsi), %xmm0 +; SSE41-NEXT: pinsrb $12, -24(%rsp,%r13), %xmm0 ; SSE41-NEXT: andl $15, %ebp ; SSE41-NEXT: pinsrb $13, -24(%rsp,%rbp), %xmm0 ; SSE41-NEXT: andl $15, %eax @@ -1082,52 +1082,52 @@ ; AVX-NEXT: pushq %r13 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movzbl (%rdi), %r9d -; AVX-NEXT: andl $15, %r9d -; AVX-NEXT: movzbl 1(%rdi), %ebx -; AVX-NEXT: movzbl 2(%rdi), %eax +; AVX-NEXT: movzbl (%rdi), %ecx +; AVX-NEXT: andl $15, %ecx +; AVX-NEXT: movzbl 1(%rdi), %eax ; AVX-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX-NEXT: movzbl 3(%rdi), %r11d -; AVX-NEXT: movzbl 4(%rdi), %r14d -; AVX-NEXT: movzbl 5(%rdi), %r15d -; AVX-NEXT: movzbl 6(%rdi), %r12d -; AVX-NEXT: movzbl 7(%rdi), %r13d -; AVX-NEXT: movzbl 8(%rdi), %r10d -; AVX-NEXT: movzbl 9(%rdi), %r8d -; AVX-NEXT: movzbl 10(%rdi), %ecx -; AVX-NEXT: movzbl 11(%rdi), %edx -; AVX-NEXT: movzbl 12(%rdi), %esi +; AVX-NEXT: movzbl 2(%rdi), %edx +; AVX-NEXT: movzbl 3(%rdi), %esi +; AVX-NEXT: movzbl 4(%rdi), %r8d +; AVX-NEXT: movzbl 5(%rdi), %r9d +; AVX-NEXT: movzbl 6(%rdi), %r10d +; AVX-NEXT: movzbl 7(%rdi), %r11d +; AVX-NEXT: movzbl 8(%rdi), %ebx +; AVX-NEXT: movzbl 9(%rdi), %r14d +; AVX-NEXT: movzbl 10(%rdi), %r15d +; AVX-NEXT: movzbl 11(%rdi), %r12d +; AVX-NEXT: movzbl 12(%rdi), %r13d ; AVX-NEXT: movzbl 13(%rdi), %ebp ; AVX-NEXT: movzbl 14(%rdi), %eax ; AVX-NEXT: movzbl 15(%rdi), %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movzbl -24(%rsp,%r9), %r9d -; AVX-NEXT: vmovd %r9d, %xmm0 -; AVX-NEXT: andl $15, %ebx -; AVX-NEXT: vpinsrb $1, -24(%rsp,%rbx), %xmm0, %xmm0 -; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX-NEXT: andl $15, %ebx -; AVX-NEXT: vpinsrb $2, -24(%rsp,%rbx), %xmm0, %xmm0 +; AVX-NEXT: movzbl -24(%rsp,%rcx), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX-NEXT: andl $15, %ecx +; AVX-NEXT: vpinsrb $1, 
-24(%rsp,%rcx), %xmm0, %xmm0 +; AVX-NEXT: andl $15, %edx +; AVX-NEXT: vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0 +; AVX-NEXT: andl $15, %esi +; AVX-NEXT: vpinsrb $3, -24(%rsp,%rsi), %xmm0, %xmm0 +; AVX-NEXT: andl $15, %r8d +; AVX-NEXT: vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0 +; AVX-NEXT: andl $15, %r9d +; AVX-NEXT: vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0 +; AVX-NEXT: andl $15, %r10d +; AVX-NEXT: vpinsrb $6, -24(%rsp,%r10), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r11d -; AVX-NEXT: vpinsrb $3, -24(%rsp,%r11), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $7, -24(%rsp,%r11), %xmm0, %xmm0 +; AVX-NEXT: andl $15, %ebx +; AVX-NEXT: vpinsrb $8, -24(%rsp,%rbx), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r14d -; AVX-NEXT: vpinsrb $4, -24(%rsp,%r14), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, -24(%rsp,%r14), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r15d -; AVX-NEXT: vpinsrb $5, -24(%rsp,%r15), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $10, -24(%rsp,%r15), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r12d -; AVX-NEXT: vpinsrb $6, -24(%rsp,%r12), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $11, -24(%rsp,%r12), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r13d -; AVX-NEXT: vpinsrb $7, -24(%rsp,%r13), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %r10d -; AVX-NEXT: vpinsrb $8, -24(%rsp,%r10), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %r8d -; AVX-NEXT: vpinsrb $9, -24(%rsp,%r8), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $10, -24(%rsp,%rcx), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %edx -; AVX-NEXT: vpinsrb $11, -24(%rsp,%rdx), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: vpinsrb $12, -24(%rsp,%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $12, -24(%rsp,%r13), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %ebp ; AVX-NEXT: vpinsrb $13, -24(%rsp,%rbp), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %eax diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -189,10 +189,10 @@ ; ALL-NEXT: # kill: def $edx killed $edx def $rdx ; ALL-NEXT: # kill: def $esi killed $esi def $rsi ; ALL-NEXT: # kill: def $edi killed $edi def $rdi -; ALL-NEXT: movl 24(%rbp), %r10d -; ALL-NEXT: andl $7, %r10d -; ALL-NEXT: movl 16(%rbp), %eax +; ALL-NEXT: movl 24(%rbp), %eax ; ALL-NEXT: andl $7, %eax +; ALL-NEXT: movl 16(%rbp), %r10d +; ALL-NEXT: andl $7, %r10d ; ALL-NEXT: andl $7, %edi ; ALL-NEXT: andl $7, %esi ; ALL-NEXT: andl $7, %edx @@ -240,10 +240,10 @@ ; ALL-NEXT: # kill: def $edx killed $edx def $rdx ; ALL-NEXT: # kill: def $esi killed $esi def $rsi ; ALL-NEXT: # kill: def $edi killed $edi def $rdi -; ALL-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; ALL-NEXT: andl $3, %r10d ; ALL-NEXT: movl {{[0-9]+}}(%rsp), %eax ; ALL-NEXT: andl $3, %eax +; ALL-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; ALL-NEXT: andl $3, %r10d ; ALL-NEXT: andl $3, %edi ; ALL-NEXT: andl $3, %esi ; ALL-NEXT: andl $3, %edx diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -2308,7 +2308,7 @@ ; ; AVX1-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5 @@ -2320,18 +2320,18 @@ ; AVX1-NEXT: vpmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255] +; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 +; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm3 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 ; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -321,110 +321,110 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSE2-LABEL: trunc_packus_v4i64_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm5 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; 
SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v4i64_v4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; SSSE3-NEXT: movdqa %xmm3, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: pandn %xmm3, %xmm5 ; SSSE3-NEXT: por %xmm5, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm3, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; 
SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v4i64_v4i32: @@ -589,265 +589,265 @@ ; SSE2-LABEL: trunc_packus_v8i64_v8i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), %xmm8 ; SSE2-NEXT: movdqa 32(%rdi), %xmm6 -; SSE2-NEXT: movdqa 48(%rdi), %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm10 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; 
SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm8 +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm6, %xmm8 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 
= xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v8i64_v8i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa (%rdi), %xmm3 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm4 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm8 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm6 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm10 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa 48(%rdi), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm11, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483647,2147483647] -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa %xmm5, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm6, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm10, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; 
SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm10 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm10, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5 -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm8 +; SSSE3-NEXT: pandn %xmm4, %xmm3 +; SSSE3-NEXT: por %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pxor %xmm0, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSSE3-NEXT: por %xmm6, %xmm8 +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 ; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 
= xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm1 +; SSSE3-NEXT: pand %xmm8, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm0, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm0 ; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v8i64_v8i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm5 -; SSE41-NEXT: movdqa 16(%rdi), %xmm4 -; SSE41-NEXT: movdqa 32(%rdi), %xmm10 -; SSE41-NEXT: movdqa 48(%rdi), %xmm9 +; SSE41-NEXT: movdqa 16(%rdi), %xmm8 +; SSE41-NEXT: movdqa 32(%rdi), %xmm7 +; SSE41-NEXT: movdqa 48(%rdi), %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [4294967295,4294967295] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm8 -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 -; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 +; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd 
{{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: xorpd %xmm2, %xmm2 ; SSE41-NEXT: movapd %xmm1, %xmm6 ; SSE41-NEXT: xorpd %xmm3, %xmm6 ; SSE41-NEXT: movapd %xmm6, %xmm7 @@ -858,7 +858,7 @@ ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: movapd %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm8, %xmm1 ; SSE41-NEXT: xorpd %xmm3, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 @@ -867,28 +867,28 @@ ; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2] -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm8, %xmm5 -; SSE41-NEXT: xorpd %xmm3, %xmm5 ; SSE41-NEXT: movapd %xmm5, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: xorpd %xmm3, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: xorpd %xmm3, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2] ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1368,46 +1368,46 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { ; SSE2-LABEL: trunc_packus_v4i64_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, 
%xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm4 @@ -1428,46 +1428,46 @@ ; ; SSSE3-LABEL: trunc_packus_v4i64_v4i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm4, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pand %xmm5, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: pshufd 
{{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm4 @@ -1620,46 +1620,46 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE2-LABEL: trunc_packus_v4i64_v4i16_store: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm4 @@ -1681,46 +1681,46 @@ ; ; SSSE3-LABEL: trunc_packus_v4i64_v4i16_store: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm4, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pand %xmm5, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm4 @@ -1879,115 +1879,115 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-LABEL: trunc_packus_v8i64_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa (%rdi), %xmm8 ; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: movdqa 32(%rdi), %xmm10 +; SSE2-NEXT: movdqa 32(%rdi), %xmm3 ; SSE2-NEXT: movdqa 48(%rdi), 
%xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm10 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm10, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm5 +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: 
pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm8 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm6, %xmm8 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm9, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm4 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] @@ -1997,115 +1997,115 @@ ; ; SSSE3-LABEL: trunc_packus_v8i64_v8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm4 +; SSSE3-NEXT: movdqa (%rdi), %xmm8 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm2 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm10 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm3 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535] +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm5, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm6, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm10, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; 
SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm10 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm10, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5 +; SSSE3-NEXT: movdqa %xmm8, %xmm2 +; SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm8 +; SSSE3-NEXT: pandn %xmm4, %xmm2 +; SSSE3-NEXT: por %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pxor %xmm0, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSSE3-NEXT: por %xmm6, %xmm8 +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pand %xmm9, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; 
SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: movdqa %xmm8, %xmm4 +; SSSE3-NEXT: pxor %xmm0, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm2, %xmm6 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm2 ; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] @@ -2115,67 +2115,67 @@ ; ; SSE41-LABEL: trunc_packus_v8i64_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm10 -; SSE41-NEXT: movdqa 16(%rdi), %xmm9 -; SSE41-NEXT: movdqa 32(%rdi), %xmm3 -; SSE41-NEXT: movdqa 48(%rdi), %xmm5 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm5 +; SSE41-NEXT: movdqa 32(%rdi), %xmm4 +; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; 
SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: xorpd %xmm5, %xmm5 ; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm7 +; SSE41-NEXT: xorpd %xmm2, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: movapd %xmm8, %xmm1 ; SSE41-NEXT: xorpd %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 @@ -2184,28 +2184,28 @@ ; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: packusdw %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm6, %xmm3 -; SSE41-NEXT: xorpd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: packusdw %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: xorpd %xmm2, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, 
%xmm4, %xmm6 ; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 -; SSE41-NEXT: movapd %xmm8, %xmm4 ; SSE41-NEXT: xorpd %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE41-NEXT: movapd %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: packusdw %xmm3, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: packusdw %xmm6, %xmm5 ; SSE41-NEXT: packusdw %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -3019,57 +3019,57 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; SSE2-LABEL: trunc_packus_v4i64_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm1, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: 
movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: packuswb %xmm3, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 @@ -3077,35 +3077,35 @@ ; ; SSSE3-LABEL: trunc_packus_v4i64_v4i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm4, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pand %xmm5, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm5, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 @@ -3275,105 +3275,105 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE2-LABEL: trunc_packus_v4i64_v4i8_store: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd 
{{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm1, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: movd %xmm2, (%rdi) +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: packuswb %xmm3, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 +; SSE2-NEXT: movd %xmm5, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v4i64_v4i8_store: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm4, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = 
[2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pand %xmm5, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm4 @@ -3539,110 +3539,110 @@ ; SSE2-LABEL: trunc_packus_v8i64_v8i8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm10 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3 -; SSE2-NEXT: movdqa 48(%rdi), %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] +; SSE2-NEXT: movdqa 48(%rdi), %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd 
%xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm10, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm8 +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = 
xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: packuswb %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm5, %xmm3 ; SSE2-NEXT: packuswb %xmm3, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq @@ -3650,177 +3650,177 @@ ; SSSE3-LABEL: trunc_packus_v8i64_v8i8: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa (%rdi), %xmm5 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm10 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm3 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa 48(%rdi), %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm11, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; 
SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm6, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm10, %xmm1 -; SSSE3-NEXT: pxor %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: movdqa %xmm6, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm8 +; SSSE3-NEXT: pandn %xmm4, %xmm3 +; SSSE3-NEXT: por %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pxor %xmm1, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: movdqa %xmm6, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSSE3-NEXT: por %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSSE3-NEXT: por %xmm0, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm10 -; 
SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm11, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: pand %xmm0, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] ; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: packuswb %xmm5, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm11, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: packuswb %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm11, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: packuswb %xmm4, %xmm3 +; SSSE3-NEXT: packuswb %xmm5, %xmm3 ; SSSE3-NEXT: packuswb %xmm3, %xmm0 ; SSSE3-NEXT: packuswb %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v8i64_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm10 -; SSE41-NEXT: movdqa 16(%rdi), %xmm9 -; SSE41-NEXT: movdqa 32(%rdi), %xmm3 -; SSE41-NEXT: movdqa 48(%rdi), %xmm5 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm5 +; SSE41-NEXT: movdqa 32(%rdi), %xmm4 +; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: 
movapd {{.*#+}} xmm1 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: xorpd %xmm5, %xmm5 ; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm7 +; SSE41-NEXT: xorpd %xmm2, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 -; 
SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: movapd %xmm8, %xmm1 ; SSE41-NEXT: xorpd %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 @@ -3829,28 +3829,28 @@ ; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: packusdw %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm6, %xmm3 -; SSE41-NEXT: xorpd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: packusdw %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: xorpd %xmm2, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 ; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 -; SSE41-NEXT: movapd %xmm8, %xmm4 ; SSE41-NEXT: xorpd %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE41-NEXT: movapd %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: packusdw %xmm3, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: packusdw %xmm6, %xmm5 ; SSE41-NEXT: packusdw %xmm5, %xmm1 ; SSE41-NEXT: packuswb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -3939,110 +3939,110 @@ ; SSE2-LABEL: trunc_packus_v8i64_v8i8_store: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm10 +; SSE2-NEXT: movdqa 16(%rdi), %xmm3 ; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] +; SSE2-NEXT: movdqa 48(%rdi), %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 
-; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm8 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm10 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; 
SSE2-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: packuswb %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: packuswb %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm5, %xmm2 ; SSE2-NEXT: packuswb %xmm2, %xmm3 ; SSE2-NEXT: packuswb %xmm3, %xmm3 ; SSE2-NEXT: movq %xmm3, (%rsi) @@ -4051,110 +4051,110 @@ ; SSSE3-LABEL: trunc_packus_v8i64_v8i8_store: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa (%rdi), %xmm5 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm10 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm3 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm2 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa 48(%rdi), %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa 
%xmm3, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm6, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm10, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm8, %xmm2 +; SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: movdqa %xmm6, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm8 +; SSSE3-NEXT: pandn %xmm4, %xmm2 +; SSSE3-NEXT: por %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pxor %xmm0, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: movdqa %xmm6, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSSE3-NEXT: por %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm6 +; 
SSSE3-NEXT: movdqa %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm10 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm8, %xmm3 +; SSSE3-NEXT: packuswb %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm0, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: packuswb %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm2 ; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: packuswb %xmm4, %xmm2 +; SSSE3-NEXT: packuswb %xmm5, %xmm2 ; SSSE3-NEXT: packuswb %xmm2, %xmm3 ; 
SSSE3-NEXT: packuswb %xmm3, %xmm3 ; SSSE3-NEXT: movq %xmm3, (%rsi) @@ -4162,67 +4162,67 @@ ; ; SSE41-LABEL: trunc_packus_v8i64_v8i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm10 -; SSE41-NEXT: movdqa 16(%rdi), %xmm9 -; SSE41-NEXT: movdqa 32(%rdi), %xmm2 -; SSE41-NEXT: movdqa 48(%rdi), %xmm5 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm5 +; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm3, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: xorpd %xmm5, %xmm5 ; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd 
%xmm0, %xmm9, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: movapd %xmm4, %xmm3 -; SSE41-NEXT: xorpd %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: xorpd %xmm1, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 +; SSE41-NEXT: movapd %xmm8, %xmm4 ; SSE41-NEXT: xorpd %xmm1, %xmm4 ; SSE41-NEXT: movapd %xmm4, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 @@ -4231,28 +4231,28 @@ ; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: packusdw %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm6, %xmm2 -; SSE41-NEXT: xorpd %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: packusdw %xmm6, %xmm4 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: xorpd %xmm1, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 ; SSE41-NEXT: movapd %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 -; SSE41-NEXT: movapd %xmm8, %xmm3 ; SSE41-NEXT: xorpd %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm6 +; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: packusdw %xmm2, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: packusdw %xmm6, %xmm5 ; SSE41-NEXT: packusdw %xmm5, %xmm4 ; SSE41-NEXT: packuswb %xmm4, %xmm4 ; SSE41-NEXT: movq %xmm4, (%rsi) @@ -4344,618 +4344,618 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-LABEL: trunc_packus_v16i64_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm11 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm15 -; SSE2-NEXT: movdqa 48(%rdi), %xmm12 -; SSE2-NEXT: movdqa 80(%rdi), %xmm2 +; SSE2-NEXT: movdqa (%rdi), %xmm7 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm12 +; SSE2-NEXT: movdqa 48(%rdi), %xmm11 +; SSE2-NEXT: movdqa 80(%rdi), %xmm10 ; SSE2-NEXT: movdqa 64(%rdi), %xmm5 -; SSE2-NEXT: movdqa 112(%rdi), %xmm3 -; SSE2-NEXT: movdqa 96(%rdi), %xmm14 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa 112(%rdi), %xmm4 +; SSE2-NEXT: movdqa 96(%rdi), %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: pxor %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; SSE2-NEXT: 
pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm8, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm6, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 +; SSE2-NEXT: movdqa %xmm8, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm13 -; SSE2-NEXT: pand %xmm13, %xmm14 -; SSE2-NEXT: pandn %xmm8, %xmm13 -; SSE2-NEXT: por %xmm14, %xmm13 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm14 -; SSE2-NEXT: pand %xmm14, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm14 -; SSE2-NEXT: por %xmm3, %xmm14 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 +; SSE2-NEXT: movdqa %xmm8, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm4 ; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm15, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; 
SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm15 -; SSE2-NEXT: pandn %xmm8, %xmm7 -; SSE2-NEXT: por %xmm15, %xmm7 -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm15, %xmm12 -; SSE2-NEXT: pandn %xmm8, %xmm15 -; SSE2-NEXT: por %xmm12, %xmm15 -; SSE2-NEXT: movdqa %xmm11, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm12 +; SSE2-NEXT: movdqa %xmm10, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 +; SSE2-NEXT: movdqa %xmm8, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm10 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm10, %xmm5 +; SSE2-NEXT: movdqa %xmm12, %xmm10 +; SSE2-NEXT: pxor %xmm1, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 +; SSE2-NEXT: movdqa %xmm8, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm12 +; SSE2-NEXT: pandn %xmm6, %xmm10 +; SSE2-NEXT: por %xmm12, %xmm10 +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 +; SSE2-NEXT: movdqa %xmm8, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm12, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm12 ; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm8, %xmm12 +; SSE2-NEXT: pandn %xmm6, %xmm12 ; SSE2-NEXT: por %xmm11, %xmm12 -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm9 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm9, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: pxor %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 +; SSE2-NEXT: movdqa %xmm8, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = 
xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: pandn %xmm6, %xmm11 +; SSE2-NEXT: por %xmm7, %xmm11 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm8 +; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm2, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm11, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2] +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm0 +; SSE2-NEXT: pand %xmm11, %xmm0 ; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm15, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm15, %xmm3 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm12, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm12, %xmm7 +; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm8 +; SSE2-NEXT: pand %xmm10, %xmm8 +; SSE2-NEXT: packuswb %xmm7, %xmm8 +; SSE2-NEXT: packuswb %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; 
SSE2-NEXT: pcmpeqd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: packuswb %xmm7, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: packuswb %xmm3, %xmm6 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm6 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm14, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm14, %xmm4 -; SSE2-NEXT: movdqa %xmm13, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm13, %xmm3 -; SSE2-NEXT: packuswb %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v16i64_v16i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm11 
-; SSSE3-NEXT: movdqa 16(%rdi), %xmm9 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm15 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm12 -; SSSE3-NEXT: movdqa 80(%rdi), %xmm2 +; SSSE3-NEXT: movdqa (%rdi), %xmm7 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm12 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm11 +; SSSE3-NEXT: movdqa 80(%rdi), %xmm10 ; SSSE3-NEXT: movdqa 64(%rdi), %xmm5 -; SSSE3-NEXT: movdqa 112(%rdi), %xmm3 -; SSSE3-NEXT: movdqa 96(%rdi), %xmm14 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa 112(%rdi), %xmm4 +; SSSE3-NEXT: movdqa 96(%rdi), %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [255,255] ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm14, %xmm7 -; SSSE3-NEXT: pxor %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm6, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm0, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm8, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm6, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 +; SSSE3-NEXT: movdqa %xmm8, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm4 +; SSSE3-NEXT: pandn %xmm6, %xmm3 +; SSSE3-NEXT: por %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pxor %xmm1, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm13 -; SSSE3-NEXT: pand %xmm13, %xmm14 -; SSSE3-NEXT: pandn %xmm8, %xmm13 -; SSSE3-NEXT: por %xmm14, %xmm13 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm14 -; SSSE3-NEXT: pand %xmm14, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm14 -; SSSE3-NEXT: por %xmm3, %xmm14 -; SSSE3-NEXT: movdqa %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm6, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 +; SSSE3-NEXT: movdqa %xmm8, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand 
%xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: pandn %xmm6, %xmm4 ; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm15, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm6, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm15 -; SSSE3-NEXT: pandn %xmm8, %xmm7 -; SSSE3-NEXT: por %xmm15, %xmm7 -; SSSE3-NEXT: movdqa %xmm12, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm6, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm15 -; SSSE3-NEXT: pand %xmm15, %xmm12 -; SSSE3-NEXT: pandn %xmm8, %xmm15 -; SSSE3-NEXT: por %xmm12, %xmm15 -; SSSE3-NEXT: movdqa %xmm11, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm6, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm12 +; SSSE3-NEXT: movdqa %xmm10, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 +; SSSE3-NEXT: movdqa %xmm8, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm10 +; SSSE3-NEXT: pandn %xmm6, %xmm5 +; SSSE3-NEXT: por %xmm10, %xmm5 +; SSSE3-NEXT: movdqa %xmm12, %xmm10 +; SSSE3-NEXT: pxor %xmm1, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 +; SSSE3-NEXT: movdqa %xmm8, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm10 +; SSSE3-NEXT: pand %xmm10, %xmm12 +; SSSE3-NEXT: pandn %xmm6, %xmm10 +; SSSE3-NEXT: por %xmm12, %xmm10 +; SSSE3-NEXT: movdqa %xmm11, %xmm12 +; SSSE3-NEXT: pxor %xmm1, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 +; SSSE3-NEXT: movdqa %xmm8, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm12, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3] +; SSSE3-NEXT: por 
%xmm15, %xmm12 ; SSSE3-NEXT: pand %xmm12, %xmm11 -; SSSE3-NEXT: pandn %xmm8, %xmm12 +; SSSE3-NEXT: pandn %xmm6, %xmm12 ; SSSE3-NEXT: por %xmm11, %xmm12 -; SSSE3-NEXT: movdqa %xmm9, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm9 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm9, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm7, %xmm11 +; SSSE3-NEXT: pxor %xmm1, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 +; SSSE3-NEXT: movdqa %xmm8, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm11 +; SSSE3-NEXT: pand %xmm11, %xmm7 +; SSSE3-NEXT: pandn %xmm6, %xmm11 +; SSSE3-NEXT: por %xmm7, %xmm11 +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: pxor %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pandn %xmm6, %xmm8 +; SSSE3-NEXT: por %xmm0, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pand %xmm7, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm6 -; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: movdqa %xmm12, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm11, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: pand %xmm12, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm0 +; SSSE3-NEXT: pand %xmm11, %xmm0 ; SSSE3-NEXT: packuswb %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm15, %xmm2 -; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm15, %xmm3 -; SSSE3-NEXT: movdqa %xmm7, %xmm2 -; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: movdqa 
%xmm12, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm12, %xmm7 +; SSSE3-NEXT: movdqa %xmm10, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm8 +; SSSE3-NEXT: pand %xmm10, %xmm8 +; SSSE3-NEXT: packuswb %xmm7, %xmm8 +; SSSE3-NEXT: packuswb %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm5 +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: packuswb %xmm7, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: pand %xmm7, %xmm6 -; SSSE3-NEXT: packuswb %xmm3, %xmm6 -; SSSE3-NEXT: packuswb %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: por %xmm4, %xmm6 +; SSSE3-NEXT: pand %xmm3, %xmm6 ; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm14, %xmm3 ; SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd 
{{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm14, %xmm4 -; SSSE3-NEXT: movdqa %xmm13, %xmm3 -; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm13, %xmm3 -; SSSE3-NEXT: packuswb %xmm4, %xmm3 -; SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: packuswb %xmm6, %xmm3 +; SSSE3-NEXT: packuswb %xmm3, %xmm5 +; SSSE3-NEXT: packuswb %xmm5, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v16i64_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm10 -; SSE41-NEXT: movdqa 16(%rdi), %xmm9 -; SSE41-NEXT: movdqa 32(%rdi), %xmm14 -; SSE41-NEXT: movdqa 48(%rdi), %xmm12 -; SSE41-NEXT: movdqa 80(%rdi), %xmm15 +; SSE41-NEXT: movdqa (%rdi), %xmm8 +; SSE41-NEXT: movdqa 16(%rdi), %xmm7 +; SSE41-NEXT: movdqa 32(%rdi), %xmm12 +; SSE41-NEXT: movdqa 48(%rdi), %xmm11 +; SSE41-NEXT: movdqa 80(%rdi), %xmm10 ; SSE41-NEXT: movdqa 64(%rdi), %xmm6 -; SSE41-NEXT: movdqa 112(%rdi), %xmm13 +; SSE41-NEXT: movdqa 112(%rdi), %xmm5 ; SSE41-NEXT: movdqa 96(%rdi), %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm7, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm9, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 -; SSE41-NEXT: movdqa %xmm13, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm11 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm13 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm13 -; SSE41-NEXT: movdqa %xmm15, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd 
{{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm6 -; SSE41-NEXT: movdqa %xmm14, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm15 -; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm15 -; SSE41-NEXT: movdqa %xmm12, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4 -; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm5 +; SSE41-NEXT: movdqa %xmm9, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5 -; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; SSE41-NEXT: movdqa %xmm10, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm9 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: xorpd %xmm2, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm9, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6 +; SSE41-NEXT: movdqa %xmm12, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: 
movapd %xmm1, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10 +; SSE41-NEXT: movdqa %xmm11, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: xorpd %xmm7, %xmm7 +; SSE41-NEXT: movapd %xmm1, %xmm8 +; SSE41-NEXT: xorpd %xmm2, %xmm8 +; SSE41-NEXT: movapd %xmm8, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm8, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movapd %xmm11, %xmm1 ; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE41-NEXT: movapd %xmm1, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: packusdw %xmm3, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm3 -; SSE41-NEXT: xorpd %xmm2, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movapd %xmm15, %xmm4 -; SSE41-NEXT: xorpd %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 +; SSE41-NEXT: packusdw %xmm8, %xmm1 +; SSE41-NEXT: movapd %xmm12, %xmm8 +; SSE41-NEXT: xorpd %xmm2, %xmm8 +; SSE41-NEXT: movapd %xmm8, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm8, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8 +; SSE41-NEXT: movapd %xmm10, %xmm9 +; SSE41-NEXT: xorpd %xmm2, %xmm9 +; SSE41-NEXT: movapd %xmm9, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm9 +; SSE41-NEXT: packusdw %xmm8, %xmm9 +; SSE41-NEXT: packusdw %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm8 +; SSE41-NEXT: xorpd %xmm2, 
%xmm8 +; SSE41-NEXT: movapd %xmm8, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm8, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: xorpd %xmm2, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 +; SSE41-NEXT: packusdw %xmm8, %xmm6 ; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm4 -; SSE41-NEXT: packusdw %xmm3, %xmm4 -; SSE41-NEXT: packusdw %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm6, %xmm3 -; SSE41-NEXT: xorpd %xmm2, %xmm3 +; SSE41-NEXT: xorpd %xmm2, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 ; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 -; SSE41-NEXT: movapd %xmm13, %xmm3 -; SSE41-NEXT: xorpd %xmm2, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm3 -; SSE41-NEXT: packusdw %xmm4, %xmm3 -; SSE41-NEXT: movapd %xmm11, %xmm4 ; SSE41-NEXT: xorpd %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE41-NEXT: movapd %xmm4, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4 -; SSE41-NEXT: movapd %xmm8, %xmm5 -; SSE41-NEXT: xorpd %xmm2, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 -; SSE41-NEXT: packusdw %xmm4, %xmm9 -; SSE41-NEXT: packusdw %xmm9, %xmm3 -; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: packusdw %xmm5, %xmm7 +; SSE41-NEXT: packusdw %xmm7, %xmm6 +; SSE41-NEXT: packuswb %xmm6, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -4964,10 +4964,10 @@ ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm8 +; AVX1-NEXT: 
vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm9 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm3 @@ -4977,39 +4977,39 @@ ; AVX1-NEXT: vmovdqa (%rdi), %xmm5 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm7, %xmm2, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm5, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm7, %xmm2, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm2, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm8, %xmm2, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm2, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm7 -; AVX1-NEXT: vpand %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm7 -; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm5 -; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm9, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 ; AVX1-NEXT: vpand %xmm2, %xmm9, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 -; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9 +; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5 +; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v16i64_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -309,122 +309,122 @@ define <4 x i32> 
@trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; SSE2-LABEL: trunc_ssat_v4i64_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm5 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; 
SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v4i64_v4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa %xmm3, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: pandn %xmm3, %xmm5 ; SSSE3-NEXT: por %xmm5, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm3, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm5 +; SSSE3-NEXT: pandn %xmm1, %xmm3 +; SSSE3-NEXT: por %xmm5, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpeqd 
%xmm6, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v4i64_v4i32: @@ -580,325 +580,325 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm3 ; SSE2-NEXT: movdqa 16(%rdi), %xmm5 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 -; SSE2-NEXT: movdqa 48(%rdi), %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; SSE2-NEXT: movdqa 32(%rdi), %xmm7 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm11, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm11 -; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; 
SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm11, %xmm5 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm10 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm10, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm7, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm8 +; SSE2-NEXT: pandn %xmm4, %xmm9 +; SSE2-NEXT: por %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: por %xmm10, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: 
pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm3, %xmm5 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm3, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v8i64_v8i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa (%rdi), %xmm3 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm5 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm11 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm10 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa 32(%rdi), %xmm7 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa %xmm6, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; 
SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm6, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm11, %xmm4 -; SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm11 -; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm7, %xmm5 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm6, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] ; SSSE3-NEXT: por %xmm11, %xmm5 -; SSSE3-NEXT: movdqa %xmm10, %xmm4 -; SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm10 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm10, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744069414584320,18446744069414584320] -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm5, %xmm7 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm7, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm7 +; SSSE3-NEXT: pxor %xmm0, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm1 +; SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSSE3-NEXT: por %xmm1, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] +; SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm9 +; SSSE3-NEXT: pand %xmm9, 
%xmm8 +; SSSE3-NEXT: pandn %xmm4, %xmm9 +; SSSE3-NEXT: por %xmm8, %xmm9 ; SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: por %xmm10, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm3, %xmm5 ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSSE3-NEXT: pand %xmm3, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm5 -; SSE41-NEXT: movdqa 16(%rdi), %xmm4 -; SSE41-NEXT: movdqa 32(%rdi), %xmm10 -; SSE41-NEXT: movdqa 48(%rdi), %xmm9 +; SSE41-NEXT: movdqa 16(%rdi), %xmm8 +; SSE41-NEXT: movdqa 32(%rdi), %xmm7 +; SSE41-NEXT: movdqa 48(%rdi), %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [2147483647,2147483647] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; SSE41-NEXT: movdqa %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm8 -; SSE41-NEXT: movdqa 
%xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] +; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm11 -; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 +; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; SSE41-NEXT: movapd %xmm1, %xmm7 ; SSE41-NEXT: xorpd %xmm3, %xmm7 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] -; SSE41-NEXT: movapd %xmm7, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movapd %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: movapd %xmm8, %xmm1 ; SSE41-NEXT: xorpd %xmm3, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: movapd %xmm1, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm1, 
%xmm0 ; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; SSE41-NEXT: movapd %xmm11, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4 -; SSE41-NEXT: xorpd %xmm8, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] +; SSE41-NEXT: movapd %xmm5, %xmm7 +; SSE41-NEXT: xorpd %xmm3, %xmm7 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: xorpd %xmm4, %xmm3 ; SSE41-NEXT: movapd %xmm3, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1360,123 +1360,123 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) { ; SSE2-LABEL: trunc_ssat_v4i64_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm5 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: 
por %xmm1, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: packssdw %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: packssdw %xmm3, %xmm0 ; SSE2-NEXT: packssdw %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v4i64_v4i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147516415,2147516415] -; SSSE3-NEXT: movdqa %xmm3, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: pandn %xmm3, %xmm5 ; SSSE3-NEXT: por %xmm5, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = 
[18446744073709518848,18446744073709518848] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm3, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm5 +; SSSE3-NEXT: pandn %xmm1, %xmm3 +; SSSE3-NEXT: por %xmm5, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: packssdw %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: packssdw %xmm3, %xmm0 ; SSSE3-NEXT: packssdw %xmm0, %xmm0 ; SSSE3-NEXT: retq ; @@ -1604,126 +1604,126 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE2-LABEL: trunc_ssat_v4i64_v4i16_store: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: 
movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848] -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709518848,18446744073709518848] +; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] ; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: por %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: packssdw %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: packssdw %xmm1, %xmm2 +; SSE2-NEXT: packssdw %xmm2, %xmm2 +; SSE2-NEXT: movq %xmm2, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v4i64_v4i16_store: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147516415,2147516415] -; SSSE3-NEXT: movdqa %xmm4, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pand %xmm5, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848] -; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709518848,18446744073709518848] +; SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm8, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm5 +; SSSE3-NEXT: pandn %xmm0, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 ; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 -; SSSE3-NEXT: packssdw %xmm0, %xmm0 -; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm0, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: packssdw %xmm1, %xmm2 +; SSSE3-NEXT: packssdw %xmm2, %xmm2 +; SSSE3-NEXT: movq %xmm2, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v4i64_v4i16_store: @@ -1855,329 +1855,329 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-LABEL: trunc_ssat_v8i64_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm11 -; SSE2-NEXT: movdqa 
16(%rdi), %xmm10 +; SSE2-NEXT: movdqa (%rdi), %xmm6 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3 ; SSE2-NEXT: movdqa 48(%rdi), %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm11, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm11 -; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm11, %xmm5 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = 
[18446744073709518848,18446744073709518848] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm8 +; SSE2-NEXT: pandn %xmm4, %xmm9 +; SSE2-NEXT: por %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: packssdw %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: packssdw %xmm9, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm3, %xmm5 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = 
xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE2-NEXT: pand %xmm3, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: packssdw %xmm4, %xmm1 +; SSE2-NEXT: packssdw %xmm5, %xmm1 ; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v8i64_v8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm11 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm10 +; SSSE3-NEXT: movdqa (%rdi), %xmm6 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm3 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767] ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147516415,2147516415] -; SSSE3-NEXT: movdqa %xmm0, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm11, %xmm4 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm11 -; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm5 +; 
SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] ; SSSE3-NEXT: por %xmm11, %xmm5 -; SSSE3-NEXT: movdqa %xmm10, %xmm4 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm10 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm10, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848] -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562035200,18446744071562035200] -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm5, %xmm6 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSSE3-NEXT: por %xmm0, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] +; SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm8 +; SSSE3-NEXT: pandn %xmm4, %xmm9 +; SSSE3-NEXT: por %xmm8, %xmm9 ; SSSE3-NEXT: movdqa %xmm5, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: por %xmm10, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: packssdw %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; 
SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: packssdw %xmm9, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm3, %xmm5 ; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSSE3-NEXT: pand %xmm3, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: por %xmm6, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: packssdw %xmm4, %xmm1 +; SSSE3-NEXT: packssdw %xmm5, %xmm1 ; SSSE3-NEXT: packssdw %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm10 -; SSE41-NEXT: movdqa 16(%rdi), %xmm9 -; SSE41-NEXT: movdqa 32(%rdi), %xmm3 -; SSE41-NEXT: movdqa 48(%rdi), %xmm5 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm5 +; SSE41-NEXT: movdqa 32(%rdi), %xmm4 +; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [32767,32767] ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147516415,2147516415] +; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: 
pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm11 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709518848,18446744073709518848] -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm2, %xmm4 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: xorpd %xmm2, %xmm7 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200] -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: movapd %xmm8, %xmm1 ; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: movapd %xmm1, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movapd %xmm5, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: packssdw %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm11, %xmm3 -; SSE41-NEXT: xorpd %xmm2, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 -; SSE41-NEXT: xorpd %xmm8, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: packssdw %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm7 +; SSE41-NEXT: xorpd 
%xmm2, %xmm7 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 +; SSE41-NEXT: xorpd %xmm3, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: packssdw %xmm3, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: packssdw %xmm7, %xmm5 ; SSE41-NEXT: packssdw %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -2739,36 +2739,36 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) { ; SSE2-LABEL: trunc_ssat_v4i64_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [127,127] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] @@ -2776,64 +2776,64 @@ ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] ; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] +; SSE2-NEXT: pand %xmm5, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm4 ; SSE2-NEXT: 
pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v4i64_v4i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [127,127] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm4, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pand %xmm5, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] @@ -2841,28 +2841,28 @@ ; 
SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] +; SSSE3-NEXT: pand %xmm5, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: por %xmm8, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: pandn %xmm1, %xmm4 ; SSSE3-NEXT: por %xmm4, %xmm0 ; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm3, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v4i64_v4i8: @@ -2996,65 +2996,65 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE2-LABEL: trunc_ssat_v4i64_v4i8_store: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [127,127] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm1 
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] ; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, (%rdi) @@ -3062,65 +3062,65 @@ ; ; SSSE3-LABEL: trunc_ssat_v4i64_v4i8_store: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [127,127] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm4, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} 
xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pand %xmm5, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] ; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] -; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709551488,18446744073709551488] +; SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm8, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm5 +; SSSE3-NEXT: pandn %xmm0, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 ; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm0, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSSE3-NEXT: movd %xmm1, (%rdi) ; SSSE3-NEXT: retq ; @@ -3260,331 +3260,331 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; 
SSE2-LABEL: trunc_ssat_v8i64_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm11 -; SSE2-NEXT: movdqa 16(%rdi), %xmm10 +; SSE2-NEXT: movdqa (%rdi), %xmm6 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3 ; SSE2-NEXT: movdqa 48(%rdi), %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [127,127] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm11, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm11 -; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm11, %xmm5 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; 
SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm8 +; SSE2-NEXT: pandn %xmm4, %xmm9 +; SSE2-NEXT: por %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: packssdw %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: packssdw %xmm9, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm3, %xmm5 ; 
SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE2-NEXT: pand %xmm3, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: packssdw %xmm4, %xmm1 +; SSE2-NEXT: packssdw %xmm5, %xmm1 ; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v8i64_v8i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm11 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm10 +; SSSE3-NEXT: movdqa (%rdi), %xmm6 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm3 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [127,127] ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm0, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm11, %xmm4 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand 
%xmm5, %xmm11 -; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] ; SSSE3-NEXT: por %xmm11, %xmm5 -; SSSE3-NEXT: movdqa %xmm10, %xmm4 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm10 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm10, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840] -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm5, %xmm6 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSSE3-NEXT: por %xmm0, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm8 +; SSSE3-NEXT: pandn %xmm4, %xmm9 +; SSSE3-NEXT: por %xmm8, %xmm9 ; SSSE3-NEXT: movdqa %xmm5, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: por %xmm10, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: packssdw 
%xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: packssdw %xmm9, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm3, %xmm5 ; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSSE3-NEXT: pand %xmm3, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: por %xmm6, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: packssdw %xmm4, %xmm1 +; SSSE3-NEXT: packssdw %xmm5, %xmm1 ; SSSE3-NEXT: packssdw %xmm1, %xmm0 ; SSSE3-NEXT: packsswb %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm10 -; SSE41-NEXT: movdqa 16(%rdi), %xmm9 -; SSE41-NEXT: movdqa 32(%rdi), %xmm3 -; SSE41-NEXT: movdqa 48(%rdi), %xmm5 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm5 +; SSE41-NEXT: movdqa 32(%rdi), %xmm4 +; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127] ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; 
SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm11 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm2, %xmm4 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: xorpd %xmm2, %xmm7 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: movapd %xmm8, %xmm1 ; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: movapd %xmm1, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movapd %xmm5, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: packssdw %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm11, %xmm3 -; SSE41-NEXT: xorpd %xmm2, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 -; SSE41-NEXT: xorpd %xmm8, %xmm2 +; SSE41-NEXT: blendvpd 
%xmm0, %xmm8, %xmm1 +; SSE41-NEXT: packssdw %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm7 +; SSE41-NEXT: xorpd %xmm2, %xmm7 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 +; SSE41-NEXT: xorpd %xmm3, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: packssdw %xmm3, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: packssdw %xmm7, %xmm5 ; SSE41-NEXT: packssdw %xmm5, %xmm1 ; SSE41-NEXT: packsswb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -3672,117 +3672,117 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-width"="256" { ; SSE2-LABEL: trunc_ssat_v8i64_v8i8_store: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm11 -; SSE2-NEXT: movdqa 16(%rdi), %xmm10 +; SSE2-NEXT: movdqa (%rdi), %xmm6 +; SSE2-NEXT: movdqa 16(%rdi), %xmm3 ; SSE2-NEXT: movdqa 32(%rdi), %xmm2 ; SSE2-NEXT: movdqa 48(%rdi), %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [127,127] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm12 -; SSE2-NEXT: por %xmm2, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm5, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; 
SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm11 -; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm11, %xmm5 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm10 -; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm3, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm3, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} 
xmm1 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm3, %xmm7 ; SSE2-NEXT: por %xmm5, %xmm7 -; SSE2-NEXT: packssdw %xmm3, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm12, %xmm0 +; SSE2-NEXT: packssdw %xmm8, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm12, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: packssdw %xmm5, %xmm0 ; SSE2-NEXT: packssdw %xmm0, %xmm7 ; SSE2-NEXT: packsswb %xmm7, %xmm7 ; SSE2-NEXT: movq %xmm7, (%rsi) @@ -3790,117 +3790,117 @@ ; ; SSSE3-LABEL: trunc_ssat_v8i64_v8i8_store: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm11 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm10 +; SSSE3-NEXT: movdqa (%rdi), %xmm6 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm3 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm2 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [127,127] ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm3, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm12 -; SSSE3-NEXT: pand %xmm12, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm12 -; SSSE3-NEXT: por %xmm2, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; 
SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm1 ; SSSE3-NEXT: movdqa %xmm5, %xmm2 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm11, %xmm4 -; SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm11 -; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm5 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] ; SSSE3-NEXT: por %xmm11, %xmm5 -; SSSE3-NEXT: movdqa %xmm10, %xmm4 -; SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm10 -; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: pand %xmm5, %xmm6 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: por %xmm3, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pxor %xmm0, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pandn %xmm3, %xmm8 +; SSSE3-NEXT: por %xmm7, 
%xmm8 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pxor %xmm0, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSSE3-NEXT: por %xmm10, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] -; SSSE3-NEXT: movdqa %xmm7, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840] -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm7 ; SSSE3-NEXT: pand %xmm7, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: pandn %xmm3, %xmm7 ; SSSE3-NEXT: por %xmm5, %xmm7 -; SSSE3-NEXT: packssdw %xmm3, %xmm7 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm12, %xmm0 +; SSSE3-NEXT: packssdw %xmm8, %xmm7 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm3, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSSE3-NEXT: pand %xmm2, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm12 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm12, %xmm0 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm3, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: packssdw %xmm5, %xmm0 ; SSSE3-NEXT: packssdw %xmm0, %xmm7 ; SSSE3-NEXT: packsswb %xmm7, %xmm7 ; SSSE3-NEXT: movq %xmm7, (%rsi) @@ -3908,97 +3908,97 @@ ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: 
movdqa (%rdi), %xmm10 -; SSE41-NEXT: movdqa 16(%rdi), %xmm9 -; SSE41-NEXT: movdqa 32(%rdi), %xmm2 -; SSE41-NEXT: movdqa 48(%rdi), %xmm5 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm5 +; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127] ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm3, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm11 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm4, %xmm3 -; SSE41-NEXT: xorpd %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm4, %xmm7 +; 
SSE41-NEXT: xorpd %xmm1, %xmm7 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movapd %xmm3, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 +; SSE41-NEXT: movapd %xmm8, %xmm4 ; SSE41-NEXT: xorpd %xmm1, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: movapd %xmm4, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: packssdw %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm2 -; SSE41-NEXT: xorpd %xmm1, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm2 -; SSE41-NEXT: xorpd %xmm8, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: packssdw %xmm7, %xmm4 +; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: xorpd %xmm1, %xmm7 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: xorpd %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: packssdw %xmm2, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: packssdw %xmm7, %xmm5 ; SSE41-NEXT: packssdw %xmm5, %xmm4 ; SSE41-NEXT: packsswb %xmm4, %xmm4 ; SSE41-NEXT: movq %xmm4, (%rsi) @@ -4088,638 +4088,638 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-LABEL: trunc_ssat_v16i64_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm11 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm14 -; SSE2-NEXT: movdqa 48(%rdi), %xmm12 +; SSE2-NEXT: movdqa (%rdi), %xmm8 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm12 +; SSE2-NEXT: movdqa 48(%rdi), %xmm11 ; SSE2-NEXT: movdqa 80(%rdi), %xmm7 -; SSE2-NEXT: movdqa 64(%rdi), %xmm2 +; SSE2-NEXT: movdqa 64(%rdi), %xmm5 ; SSE2-NEXT: movdqa 112(%rdi), %xmm4 ; SSE2-NEXT: movdqa 96(%rdi), %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] -; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm15, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = 
[127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] ; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm13 -; SSE2-NEXT: pand %xmm13, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm13 -; SSE2-NEXT: por %xmm3, %xmm13 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm15, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm9, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm6, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 +; SSE2-NEXT: movdqa %xmm9, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm6, %xmm3 ; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm15, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: pxor %xmm15, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 +; SSE2-NEXT: movdqa %xmm9, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 +; SSE2-NEXT: movdqa %xmm9, 
%xmm14 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm7, %xmm5 -; SSE2-NEXT: movdqa %xmm14, %xmm0 -; SSE2-NEXT: pxor %xmm15, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm14 -; SSE2-NEXT: pandn %xmm8, %xmm7 -; SSE2-NEXT: por %xmm14, %xmm7 -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: pxor %xmm15, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm14 -; SSE2-NEXT: pand %xmm14, %xmm12 -; SSE2-NEXT: pandn %xmm8, %xmm14 -; SSE2-NEXT: por %xmm12, %xmm14 -; SSE2-NEXT: movdqa %xmm11, %xmm0 -; SSE2-NEXT: pxor %xmm15, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm12 +; SSE2-NEXT: movdqa %xmm12, %xmm7 +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 +; SSE2-NEXT: movdqa %xmm9, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm12 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: por %xmm12, %xmm7 +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 +; SSE2-NEXT: movdqa %xmm9, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm12, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm12 ; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm8, %xmm12 +; SSE2-NEXT: pandn %xmm6, %xmm12 ; SSE2-NEXT: por %xmm11, %xmm12 -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: pxor %xmm15, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm9 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm9, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm15, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840] -; 
SSE2-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: pxor %xmm15, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm8, %xmm11 +; SSE2-NEXT: pxor %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 +; SSE2-NEXT: movdqa %xmm9, %xmm14 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm15 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3] +; SSE2-NEXT: por %xmm15, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm8 +; SSE2-NEXT: pandn %xmm6, %xmm11 +; SSE2-NEXT: por %xmm8, %xmm11 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm10 +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm13 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm13 +; SSE2-NEXT: pand %xmm13, %xmm10 +; SSE2-NEXT: pandn %xmm6, %xmm13 +; SSE2-NEXT: por %xmm10, %xmm13 +; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm14 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm12, %xmm0 -; SSE2-NEXT: packssdw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm14, %xmm1 -; SSE2-NEXT: pxor %xmm15, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm14 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm14, %xmm1 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm15, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; 
SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: packssdw %xmm1, %xmm2 -; SSE2-NEXT: packssdw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm15, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm15, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: packssdw %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm15, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: por %xmm14, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm11 +; SSE2-NEXT: pandn %xmm6, %xmm0 +; SSE2-NEXT: por %xmm11, %xmm0 +; SSE2-NEXT: packssdw %xmm13, %xmm0 +; SSE2-NEXT: movdqa %xmm12, %xmm10 +; SSE2-NEXT: pxor %xmm1, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm12 +; SSE2-NEXT: pandn %xmm6, %xmm10 +; SSE2-NEXT: por %xmm12, %xmm10 +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: pxor %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: pandn %xmm6, %xmm11 +; SSE2-NEXT: por %xmm7, %xmm11 +; SSE2-NEXT: packssdw %xmm10, %xmm11 +; SSE2-NEXT: packssdw %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: por %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: packssdw %xmm7, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; 
SSE2-NEXT: pcmpeqd %xmm8, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] +; SSE2-NEXT: pand %xmm3, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm13 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm13, %xmm3 -; SSE2-NEXT: packssdw %xmm1, %xmm3 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: packsswb %xmm2, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm4, %xmm1 +; SSE2-NEXT: packssdw %xmm1, %xmm5 +; SSE2-NEXT: packsswb %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v16i64_v16i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm11 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm9 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm14 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm12 +; SSSE3-NEXT: movdqa (%rdi), %xmm8 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm12 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm11 ; SSSE3-NEXT: movdqa 80(%rdi), %xmm7 -; SSSE3-NEXT: movdqa 64(%rdi), %xmm2 +; SSSE3-NEXT: movdqa 64(%rdi), %xmm5 ; SSSE3-NEXT: movdqa 112(%rdi), %xmm4 ; SSSE3-NEXT: movdqa 96(%rdi), %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127] -; SSSE3-NEXT: movdqa {{.*#+}} xmm15 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm15, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [127,127] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] ; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm13 -; SSSE3-NEXT: pand %xmm13, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm13 -; SSSE3-NEXT: por %xmm3, %xmm13 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm15, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm9, %xmm14 
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm6, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 +; SSSE3-NEXT: movdqa %xmm9, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm6, %xmm3 ; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm15, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm7, %xmm0 -; SSSE3-NEXT: pxor %xmm15, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pxor %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 +; SSSE3-NEXT: movdqa %xmm9, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pandn %xmm6, %xmm4 +; SSSE3-NEXT: por %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm7, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 +; SSSE3-NEXT: movdqa %xmm9, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: pandn %xmm6, %xmm5 ; SSSE3-NEXT: por %xmm7, %xmm5 -; SSSE3-NEXT: movdqa %xmm14, %xmm0 -; SSSE3-NEXT: pxor %xmm15, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm14 -; SSSE3-NEXT: pandn %xmm8, %xmm7 -; SSSE3-NEXT: por %xmm14, %xmm7 -; SSSE3-NEXT: movdqa %xmm12, %xmm0 -; SSSE3-NEXT: pxor %xmm15, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 
= xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm14 -; SSSE3-NEXT: pand %xmm14, %xmm12 -; SSSE3-NEXT: pandn %xmm8, %xmm14 -; SSSE3-NEXT: por %xmm12, %xmm14 -; SSSE3-NEXT: movdqa %xmm11, %xmm0 -; SSSE3-NEXT: pxor %xmm15, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm12 +; SSSE3-NEXT: movdqa %xmm12, %xmm7 +; SSSE3-NEXT: pxor %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 +; SSSE3-NEXT: movdqa %xmm9, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm12 +; SSSE3-NEXT: pandn %xmm6, %xmm7 +; SSSE3-NEXT: por %xmm12, %xmm7 +; SSSE3-NEXT: movdqa %xmm11, %xmm12 +; SSSE3-NEXT: pxor %xmm1, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 +; SSSE3-NEXT: movdqa %xmm9, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm12, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm12 ; SSSE3-NEXT: pand %xmm12, %xmm11 -; SSSE3-NEXT: pandn %xmm8, %xmm12 +; SSSE3-NEXT: pandn %xmm6, %xmm12 ; SSSE3-NEXT: por %xmm11, %xmm12 -; SSSE3-NEXT: movdqa %xmm9, %xmm0 -; SSSE3-NEXT: pxor %xmm15, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm9 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm9, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm15, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm11 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840] -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm11, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm12, %xmm0 -; SSSE3-NEXT: pxor %xmm15, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm1, %xmm6 +; SSSE3-NEXT: movdqa %xmm8, %xmm11 +; SSSE3-NEXT: pxor %xmm1, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 +; SSSE3-NEXT: movdqa %xmm9, %xmm14 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm15 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3] +; SSSE3-NEXT: por %xmm15, %xmm11 +; SSSE3-NEXT: pand %xmm11, %xmm8 
+; SSSE3-NEXT: pandn %xmm6, %xmm11 +; SSSE3-NEXT: por %xmm8, %xmm11 +; SSSE3-NEXT: movdqa %xmm0, %xmm8 +; SSSE3-NEXT: pxor %xmm1, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm10 +; SSSE3-NEXT: pand %xmm10, %xmm0 +; SSSE3-NEXT: pandn %xmm6, %xmm10 +; SSSE3-NEXT: por %xmm0, %xmm10 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; SSSE3-NEXT: movdqa %xmm10, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm13 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm13 +; SSSE3-NEXT: pand %xmm13, %xmm10 +; SSSE3-NEXT: pandn %xmm6, %xmm13 +; SSSE3-NEXT: por %xmm10, %xmm13 +; SSSE3-NEXT: movdqa %xmm11, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSSE3-NEXT: pand %xmm10, %xmm14 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm12 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm12, %xmm0 -; SSSE3-NEXT: packssdw %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm14, %xmm1 -; SSSE3-NEXT: pxor %xmm15, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm14 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm14, %xmm1 -; SSSE3-NEXT: movdqa %xmm7, %xmm2 -; SSSE3-NEXT: pxor %xmm15, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm11, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: packssdw %xmm1, %xmm2 -; SSSE3-NEXT: packssdw %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm15, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm15, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm2 -; 
SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm4, %xmm2 -; SSSE3-NEXT: packssdw %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm15, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: por %xmm14, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm11 +; SSSE3-NEXT: pandn %xmm6, %xmm0 +; SSSE3-NEXT: por %xmm11, %xmm0 +; SSSE3-NEXT: packssdw %xmm13, %xmm0 +; SSSE3-NEXT: movdqa %xmm12, %xmm10 +; SSSE3-NEXT: pxor %xmm1, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm11, %xmm13 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm13, %xmm10 +; SSSE3-NEXT: pand %xmm10, %xmm12 +; SSSE3-NEXT: pandn %xmm6, %xmm10 +; SSSE3-NEXT: por %xmm12, %xmm10 +; SSSE3-NEXT: movdqa %xmm7, %xmm11 +; SSSE3-NEXT: pxor %xmm1, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] +; SSSE3-NEXT: pand %xmm12, %xmm13 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSSE3-NEXT: por %xmm13, %xmm11 +; SSSE3-NEXT: pand %xmm11, %xmm7 +; SSSE3-NEXT: pandn %xmm6, %xmm11 +; SSSE3-NEXT: por %xmm7, %xmm11 +; SSSE3-NEXT: packssdw %xmm10, %xmm11 +; SSSE3-NEXT: packssdw %xmm11, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pxor %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2] +; SSSE3-NEXT: pand %xmm10, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: pandn %xmm6, %xmm7 +; SSSE3-NEXT: por %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm10, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm4 +; SSSE3-NEXT: pandn %xmm6, %xmm5 +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: packssdw %xmm7, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSSE3-NEXT: pand %xmm7, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pandn %xmm6, %xmm4 +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] +; SSSE3-NEXT: pand %xmm3, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = 
xmm15[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm13 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm13, %xmm3 -; SSSE3-NEXT: packssdw %xmm1, %xmm3 -; SSSE3-NEXT: packssdw %xmm3, %xmm2 -; SSSE3-NEXT: packsswb %xmm2, %xmm0 +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pandn %xmm6, %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: packssdw %xmm4, %xmm1 +; SSSE3-NEXT: packssdw %xmm1, %xmm5 +; SSSE3-NEXT: packsswb %xmm5, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v16i64_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm11 -; SSE41-NEXT: movdqa 16(%rdi), %xmm9 -; SSE41-NEXT: movdqa 32(%rdi), %xmm15 -; SSE41-NEXT: movdqa 48(%rdi), %xmm12 -; SSE41-NEXT: movdqa 80(%rdi), %xmm4 -; SSE41-NEXT: movdqa 64(%rdi), %xmm14 -; SSE41-NEXT: movdqa 112(%rdi), %xmm13 -; SSE41-NEXT: movdqa 96(%rdi), %xmm3 +; SSE41-NEXT: movdqa (%rdi), %xmm8 +; SSE41-NEXT: movdqa 16(%rdi), %xmm7 +; SSE41-NEXT: movdqa 32(%rdi), %xmm12 +; SSE41-NEXT: movdqa 48(%rdi), %xmm11 +; SSE41-NEXT: movdqa 80(%rdi), %xmm10 +; SSE41-NEXT: movdqa 64(%rdi), %xmm6 +; SSE41-NEXT: movdqa 112(%rdi), %xmm5 +; SSE41-NEXT: movdqa 96(%rdi), %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127] ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm7, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm7, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8 -; SSE41-NEXT: movdqa %xmm13, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm10 -; SSE41-NEXT: movdqa %xmm14, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm13 -; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm13 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm9, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm14 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm14 -; SSE41-NEXT: movdqa %xmm15, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: 
blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6 ; SSE41-NEXT: movdqa %xmm12, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm15 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm15 +; SSE41-NEXT: movdqa %xmm9, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10 ; SSE41-NEXT: movdqa %xmm11, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm7, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm6 -; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 +; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm9, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 +; SSE41-NEXT: movdqa %xmm9, %xmm13 +; 
SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm1, %xmm5 -; SSE41-NEXT: xorpd %xmm2, %xmm5 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movapd %xmm5, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm9 +; SSE41-NEXT: xorpd %xmm2, %xmm9 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: movapd %xmm9, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm13, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: movapd %xmm11, %xmm1 ; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm13, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE41-NEXT: packssdw %xmm3, %xmm1 -; SSE41-NEXT: movapd %xmm15, %xmm3 -; SSE41-NEXT: xorpd %xmm2, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 +; SSE41-NEXT: packssdw %xmm9, %xmm1 +; SSE41-NEXT: movapd %xmm12, %xmm9 +; SSE41-NEXT: xorpd %xmm2, %xmm9 +; SSE41-NEXT: movapd %xmm9, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm9 +; SSE41-NEXT: movapd %xmm10, %xmm11 +; SSE41-NEXT: xorpd %xmm2, %xmm11 +; SSE41-NEXT: movapd %xmm11, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11 +; SSE41-NEXT: packssdw %xmm9, %xmm11 +; SSE41-NEXT: packssdw %xmm11, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm9 +; SSE41-NEXT: xorpd %xmm2, %xmm9 +; SSE41-NEXT: movapd %xmm9, %xmm10 +; 
SSE41-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm9 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: xorpd %xmm2, %xmm6 +; SSE41-NEXT: movapd %xmm6, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 +; SSE41-NEXT: packssdw %xmm9, %xmm6 ; SSE41-NEXT: movapd %xmm4, %xmm5 ; SSE41-NEXT: xorpd %xmm2, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: movapd %xmm7, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 -; SSE41-NEXT: packssdw %xmm3, %xmm5 -; SSE41-NEXT: packssdw %xmm5, %xmm1 -; SSE41-NEXT: movapd %xmm14, %xmm3 -; SSE41-NEXT: xorpd %xmm2, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm3 -; SSE41-NEXT: movapd %xmm13, %xmm4 -; SSE41-NEXT: xorpd %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm4 -; SSE41-NEXT: packssdw %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm10, %xmm3 -; SSE41-NEXT: xorpd %xmm2, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 -; SSE41-NEXT: xorpd %xmm8, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE41-NEXT: xorpd %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 -; SSE41-NEXT: packssdw %xmm3, %xmm7 -; SSE41-NEXT: packssdw %xmm7, %xmm4 -; SSE41-NEXT: packsswb %xmm4, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: packssdw %xmm5, %xmm7 +; SSE41-NEXT: packssdw %xmm7, %xmm6 +; SSE41-NEXT: packsswb %xmm6, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -4728,10 +4728,10 @@ ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm8 +; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; 
AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm9 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm3 @@ -4741,39 +4741,39 @@ ; AVX1-NEXT: vmovdqa (%rdi), %xmm5 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm7, %xmm2, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm5, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm7, %xmm2, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm2, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm8, %xmm2, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm2, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpackssdw %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm6, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm6, %xmm2 -; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm9, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm9, %xmm6, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm8, %xmm6, %xmm3 -; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm8, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm8 +; AVX1-NEXT: vblendvpd %xmm8, %xmm7, %xmm6, %xmm7 +; AVX1-NEXT: vpackssdw %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpackssdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v16i64_v16i8: @@ -5624,82 +5624,82 @@ ; SSE2-NEXT: pand %xmm3, %xmm4 ; SSE2-NEXT: pandn %xmm5, %xmm3 ; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: movd %xmm3, %edx -; 
SSE2-NEXT: movw %dx, 36(%rdi) -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: movw %cx, 24(%rdi) -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: movw %ax, 12(%rdi) -; SSE2-NEXT: movd %xmm0, %r8d -; SSE2-NEXT: movw %r8w, (%rdi) +; SSE2-NEXT: movd %xmm3, %r8d +; SSE2-NEXT: movw %r8w, 36(%rdi) +; SSE2-NEXT: movd %xmm2, %r11d +; SSE2-NEXT: movw %r11w, 24(%rdi) +; SSE2-NEXT: movd %xmm1, %r14d +; SSE2-NEXT: movw %r14w, 12(%rdi) +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] -; SSE2-NEXT: movd %xmm4, %r9d -; SSE2-NEXT: movw %r9w, 45(%rdi) +; SSE2-NEXT: movd %xmm4, %ecx +; SSE2-NEXT: movw %cx, 45(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; SSE2-NEXT: movd %xmm4, %r10d -; SSE2-NEXT: movw %r10w, 42(%rdi) +; SSE2-NEXT: movd %xmm4, %edx +; SSE2-NEXT: movw %dx, 42(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] -; SSE2-NEXT: movd %xmm3, %r11d -; SSE2-NEXT: movw %r11w, 39(%rdi) -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: movb %dl, 38(%rdi) +; SSE2-NEXT: movd %xmm3, %esi +; SSE2-NEXT: movw %si, 39(%rdi) +; SSE2-NEXT: shrl $16, %r8d +; SSE2-NEXT: movb %r8b, 38(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] -; SSE2-NEXT: movd %xmm3, %r14d -; SSE2-NEXT: movw %r14w, 33(%rdi) +; SSE2-NEXT: movd %xmm3, %r8d +; SSE2-NEXT: movw %r8w, 33(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE2-NEXT: movd %xmm3, %r15d -; SSE2-NEXT: movw %r15w, 30(%rdi) +; SSE2-NEXT: movd %xmm3, %r9d +; SSE2-NEXT: movw %r9w, 30(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE2-NEXT: movd %xmm2, %r12d -; SSE2-NEXT: movw %r12w, 27(%rdi) -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: movb %cl, 26(%rdi) +; SSE2-NEXT: movd %xmm2, %r10d +; SSE2-NEXT: movw %r10w, 27(%rdi) +; SSE2-NEXT: shrl $16, %r11d +; SSE2-NEXT: movb %r11b, 26(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE2-NEXT: movd %xmm2, %esi -; SSE2-NEXT: movw %si, 21(%rdi) +; SSE2-NEXT: movd %xmm2, %r11d +; SSE2-NEXT: movw %r11w, 21(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE2-NEXT: movd %xmm2, %ebx ; SSE2-NEXT: movw %bx, 18(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE2-NEXT: movd %xmm1, %ebp ; SSE2-NEXT: movw %bp, 15(%rdi) -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 14(%rdi) +; SSE2-NEXT: shrl $16, %r14d +; SSE2-NEXT: movb %r14b, 14(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: movw %ax, 9(%rdi) +; SSE2-NEXT: movd %xmm1, %r14d +; SSE2-NEXT: movw %r14w, 9(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movw %cx, 6(%rdi) +; SSE2-NEXT: movd %xmm1, %r15d +; SSE2-NEXT: movw %r15w, 6(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: movd %xmm0, %edx -; SSE2-NEXT: movw %dx, 3(%rdi) +; SSE2-NEXT: movd %xmm0, %r12d +; SSE2-NEXT: movw %r12w, 3(%rdi) +; SSE2-NEXT: shrl $16, %eax +; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 47(%rdi) +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 44(%rdi) +; SSE2-NEXT: shrl $16, %esi +; SSE2-NEXT: movb %sil, 41(%rdi) ; SSE2-NEXT: shrl $16, %r8d -; SSE2-NEXT: movb %r8b, 2(%rdi) +; SSE2-NEXT: movb %r8b, 35(%rdi) ; SSE2-NEXT: shrl $16, %r9d -; SSE2-NEXT: movb %r9b, 47(%rdi) +; SSE2-NEXT: movb %r9b, 32(%rdi) ; SSE2-NEXT: shrl $16, %r10d -; SSE2-NEXT: movb %r10b, 44(%rdi) +; SSE2-NEXT: movb %r10b, 29(%rdi) ; SSE2-NEXT: shrl $16, %r11d -; SSE2-NEXT: movb %r11b, 41(%rdi) -; SSE2-NEXT: shrl $16, %r14d -; SSE2-NEXT: movb %r14b, 
35(%rdi) -; SSE2-NEXT: shrl $16, %r15d -; SSE2-NEXT: movb %r15b, 32(%rdi) -; SSE2-NEXT: shrl $16, %r12d -; SSE2-NEXT: movb %r12b, 29(%rdi) -; SSE2-NEXT: shrl $16, %esi -; SSE2-NEXT: movb %sil, 23(%rdi) +; SSE2-NEXT: movb %r11b, 23(%rdi) ; SSE2-NEXT: shrl $16, %ebx ; SSE2-NEXT: movb %bl, 20(%rdi) ; SSE2-NEXT: shrl $16, %ebp ; SSE2-NEXT: movb %bpl, 17(%rdi) -; SSE2-NEXT: shrl $16, %eax -; SSE2-NEXT: movb %al, 11(%rdi) -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: movb %cl, 8(%rdi) -; SSE2-NEXT: shrl $16, %edx -; SSE2-NEXT: movb %dl, 5(%rdi) +; SSE2-NEXT: shrl $16, %r14d +; SSE2-NEXT: movb %r14b, 11(%rdi) +; SSE2-NEXT: shrl $16, %r15d +; SSE2-NEXT: movb %r15b, 8(%rdi) +; SSE2-NEXT: shrl $16, %r12d +; SSE2-NEXT: movb %r12b, 5(%rdi) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r14 @@ -5756,82 +5756,82 @@ ; SSSE3-NEXT: pand %xmm3, %xmm4 ; SSSE3-NEXT: pandn %xmm5, %xmm3 ; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: movd %xmm3, %edx -; SSSE3-NEXT: movw %dx, 36(%rdi) -; SSSE3-NEXT: movd %xmm2, %ecx -; SSSE3-NEXT: movw %cx, 24(%rdi) -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: movw %ax, 12(%rdi) -; SSSE3-NEXT: movd %xmm0, %r8d -; SSSE3-NEXT: movw %r8w, (%rdi) +; SSSE3-NEXT: movd %xmm3, %r8d +; SSSE3-NEXT: movw %r8w, 36(%rdi) +; SSSE3-NEXT: movd %xmm2, %r11d +; SSSE3-NEXT: movw %r11w, 24(%rdi) +; SSSE3-NEXT: movd %xmm1, %r14d +; SSSE3-NEXT: movw %r14w, 12(%rdi) +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: movw %ax, (%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] -; SSSE3-NEXT: movd %xmm4, %r9d -; SSSE3-NEXT: movw %r9w, 45(%rdi) +; SSSE3-NEXT: movd %xmm4, %ecx +; SSSE3-NEXT: movw %cx, 45(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; SSSE3-NEXT: movd %xmm4, %r10d -; SSSE3-NEXT: movw %r10w, 42(%rdi) +; SSSE3-NEXT: movd %xmm4, %edx +; SSSE3-NEXT: movw %dx, 42(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] -; SSSE3-NEXT: movd %xmm3, %r11d -; SSSE3-NEXT: movw %r11w, 39(%rdi) -; SSSE3-NEXT: shrl $16, %edx -; SSSE3-NEXT: movb %dl, 38(%rdi) +; SSSE3-NEXT: movd %xmm3, %esi +; SSSE3-NEXT: movw %si, 39(%rdi) +; SSSE3-NEXT: shrl $16, %r8d +; SSSE3-NEXT: movb %r8b, 38(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] -; SSSE3-NEXT: movd %xmm3, %r14d -; SSSE3-NEXT: movw %r14w, 33(%rdi) +; SSSE3-NEXT: movd %xmm3, %r8d +; SSSE3-NEXT: movw %r8w, 33(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSSE3-NEXT: movd %xmm3, %r15d -; SSSE3-NEXT: movw %r15w, 30(%rdi) +; SSSE3-NEXT: movd %xmm3, %r9d +; SSSE3-NEXT: movw %r9w, 30(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSSE3-NEXT: movd %xmm2, %r12d -; SSSE3-NEXT: movw %r12w, 27(%rdi) -; SSSE3-NEXT: shrl $16, %ecx -; SSSE3-NEXT: movb %cl, 26(%rdi) +; SSSE3-NEXT: movd %xmm2, %r10d +; SSSE3-NEXT: movw %r10w, 27(%rdi) +; SSSE3-NEXT: shrl $16, %r11d +; SSSE3-NEXT: movb %r11b, 26(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSSE3-NEXT: movd %xmm2, %esi -; SSSE3-NEXT: movw %si, 21(%rdi) +; SSSE3-NEXT: movd %xmm2, %r11d +; SSSE3-NEXT: movw %r11w, 21(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSSE3-NEXT: movd %xmm2, %ebx ; SSSE3-NEXT: movw %bx, 18(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSSE3-NEXT: movd %xmm1, %ebp ; SSSE3-NEXT: movw %bp, 15(%rdi) -; SSSE3-NEXT: shrl $16, %eax -; SSSE3-NEXT: movb %al, 14(%rdi) +; SSSE3-NEXT: shrl $16, %r14d +; SSSE3-NEXT: movb %r14b, 14(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: movw %ax, 9(%rdi) +; SSSE3-NEXT: movd %xmm1, %r14d +; SSSE3-NEXT: movw 
%r14w, 9(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSSE3-NEXT: movd %xmm1, %ecx -; SSSE3-NEXT: movw %cx, 6(%rdi) +; SSSE3-NEXT: movd %xmm1, %r15d +; SSSE3-NEXT: movw %r15w, 6(%rdi) ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSSE3-NEXT: movd %xmm0, %edx -; SSSE3-NEXT: movw %dx, 3(%rdi) +; SSSE3-NEXT: movd %xmm0, %r12d +; SSSE3-NEXT: movw %r12w, 3(%rdi) +; SSSE3-NEXT: shrl $16, %eax +; SSSE3-NEXT: movb %al, 2(%rdi) +; SSSE3-NEXT: shrl $16, %ecx +; SSSE3-NEXT: movb %cl, 47(%rdi) +; SSSE3-NEXT: shrl $16, %edx +; SSSE3-NEXT: movb %dl, 44(%rdi) +; SSSE3-NEXT: shrl $16, %esi +; SSSE3-NEXT: movb %sil, 41(%rdi) ; SSSE3-NEXT: shrl $16, %r8d -; SSSE3-NEXT: movb %r8b, 2(%rdi) +; SSSE3-NEXT: movb %r8b, 35(%rdi) ; SSSE3-NEXT: shrl $16, %r9d -; SSSE3-NEXT: movb %r9b, 47(%rdi) +; SSSE3-NEXT: movb %r9b, 32(%rdi) ; SSSE3-NEXT: shrl $16, %r10d -; SSSE3-NEXT: movb %r10b, 44(%rdi) +; SSSE3-NEXT: movb %r10b, 29(%rdi) ; SSSE3-NEXT: shrl $16, %r11d -; SSSE3-NEXT: movb %r11b, 41(%rdi) -; SSSE3-NEXT: shrl $16, %r14d -; SSSE3-NEXT: movb %r14b, 35(%rdi) -; SSSE3-NEXT: shrl $16, %r15d -; SSSE3-NEXT: movb %r15b, 32(%rdi) -; SSSE3-NEXT: shrl $16, %r12d -; SSSE3-NEXT: movb %r12b, 29(%rdi) -; SSSE3-NEXT: shrl $16, %esi -; SSSE3-NEXT: movb %sil, 23(%rdi) +; SSSE3-NEXT: movb %r11b, 23(%rdi) ; SSSE3-NEXT: shrl $16, %ebx ; SSSE3-NEXT: movb %bl, 20(%rdi) ; SSSE3-NEXT: shrl $16, %ebp ; SSSE3-NEXT: movb %bpl, 17(%rdi) -; SSSE3-NEXT: shrl $16, %eax -; SSSE3-NEXT: movb %al, 11(%rdi) -; SSSE3-NEXT: shrl $16, %ecx -; SSSE3-NEXT: movb %cl, 8(%rdi) -; SSSE3-NEXT: shrl $16, %edx -; SSSE3-NEXT: movb %dl, 5(%rdi) +; SSSE3-NEXT: shrl $16, %r14d +; SSSE3-NEXT: movb %r14b, 11(%rdi) +; SSSE3-NEXT: shrl $16, %r15d +; SSSE3-NEXT: movb %r15b, 8(%rdi) +; SSSE3-NEXT: shrl $16, %r12d +; SSSE3-NEXT: movb %r12b, 5(%rdi) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r14 @@ -6084,72 +6084,72 @@ ; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512-NEXT: vpextrd $3, %xmm1, %ecx -; AVX512-NEXT: movw %cx, 45(%rdi) -; AVX512-NEXT: vpextrd $2, %xmm1, %eax -; AVX512-NEXT: movw %ax, 42(%rdi) +; AVX512-NEXT: vpextrd $3, %xmm1, %r15d +; AVX512-NEXT: movw %r15w, 45(%rdi) +; AVX512-NEXT: vpextrd $2, %xmm1, %r14d +; AVX512-NEXT: movw %r14w, 42(%rdi) ; AVX512-NEXT: vpextrd $1, %xmm1, %ebp ; AVX512-NEXT: movw %bp, 39(%rdi) -; AVX512-NEXT: vmovd %xmm1, %esi -; AVX512-NEXT: movw %si, 36(%rdi) +; AVX512-NEXT: vmovd %xmm1, %r11d +; AVX512-NEXT: movw %r11w, 36(%rdi) ; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512-NEXT: vpextrd $3, %xmm1, %ebx ; AVX512-NEXT: movw %bx, 33(%rdi) -; AVX512-NEXT: vpextrd $2, %xmm1, %edx -; AVX512-NEXT: movw %dx, 30(%rdi) -; AVX512-NEXT: vpextrd $1, %xmm1, %r15d -; AVX512-NEXT: movw %r15w, 27(%rdi) -; AVX512-NEXT: vmovd %xmm1, %r14d -; AVX512-NEXT: movw %r14w, 24(%rdi) -; AVX512-NEXT: vpextrd $3, %xmm0, %r11d -; AVX512-NEXT: movw %r11w, 9(%rdi) -; AVX512-NEXT: vpextrd $2, %xmm0, %r10d -; AVX512-NEXT: movw %r10w, 6(%rdi) -; AVX512-NEXT: vpextrd $1, %xmm0, %r9d -; AVX512-NEXT: movw %r9w, 3(%rdi) -; AVX512-NEXT: vmovd %xmm0, %r8d -; AVX512-NEXT: movw %r8w, (%rdi) -; AVX512-NEXT: shrl $16, %ecx -; AVX512-NEXT: movb %cl, 47(%rdi) -; AVX512-NEXT: shrl $16, %eax -; AVX512-NEXT: movb %al, 44(%rdi) -; AVX512-NEXT: shrl $16, %ebp -; AVX512-NEXT: movb %bpl, 41(%rdi) -; AVX512-NEXT: shrl $16, %esi -; AVX512-NEXT: movb %sil, 38(%rdi) -; AVX512-NEXT: 
shrl $16, %ebx -; AVX512-NEXT: movb %bl, 35(%rdi) -; AVX512-NEXT: shrl $16, %edx -; AVX512-NEXT: movb %dl, 32(%rdi) -; AVX512-NEXT: shrl $16, %r15d -; AVX512-NEXT: movb %r15b, 29(%rdi) -; AVX512-NEXT: shrl $16, %r14d -; AVX512-NEXT: movb %r14b, 26(%rdi) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpextrd $2, %xmm1, %r10d +; AVX512-NEXT: movw %r10w, 30(%rdi) +; AVX512-NEXT: vpextrd $1, %xmm1, %r9d +; AVX512-NEXT: movw %r9w, 27(%rdi) +; AVX512-NEXT: vmovd %xmm1, %r8d +; AVX512-NEXT: movw %r8w, 24(%rdi) ; AVX512-NEXT: vpextrd $3, %xmm0, %esi -; AVX512-NEXT: movw %si, 21(%rdi) +; AVX512-NEXT: movw %si, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm0, %edx -; AVX512-NEXT: movw %dx, 18(%rdi) +; AVX512-NEXT: movw %dx, 6(%rdi) ; AVX512-NEXT: vpextrd $1, %xmm0, %ecx -; AVX512-NEXT: movw %cx, 15(%rdi) +; AVX512-NEXT: movw %cx, 3(%rdi) ; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: movw %ax, 12(%rdi) +; AVX512-NEXT: movw %ax, (%rdi) +; AVX512-NEXT: shrl $16, %r15d +; AVX512-NEXT: movb %r15b, 47(%rdi) +; AVX512-NEXT: shrl $16, %r14d +; AVX512-NEXT: movb %r14b, 44(%rdi) +; AVX512-NEXT: shrl $16, %ebp +; AVX512-NEXT: movb %bpl, 41(%rdi) ; AVX512-NEXT: shrl $16, %r11d -; AVX512-NEXT: movb %r11b, 11(%rdi) +; AVX512-NEXT: movb %r11b, 38(%rdi) +; AVX512-NEXT: shrl $16, %ebx +; AVX512-NEXT: movb %bl, 35(%rdi) ; AVX512-NEXT: shrl $16, %r10d -; AVX512-NEXT: movb %r10b, 8(%rdi) +; AVX512-NEXT: movb %r10b, 32(%rdi) ; AVX512-NEXT: shrl $16, %r9d -; AVX512-NEXT: movb %r9b, 5(%rdi) +; AVX512-NEXT: movb %r9b, 29(%rdi) ; AVX512-NEXT: shrl $16, %r8d -; AVX512-NEXT: movb %r8b, 2(%rdi) +; AVX512-NEXT: movb %r8b, 26(%rdi) +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpextrd $3, %xmm0, %r11d +; AVX512-NEXT: movw %r11w, 21(%rdi) +; AVX512-NEXT: vpextrd $2, %xmm0, %r10d +; AVX512-NEXT: movw %r10w, 18(%rdi) +; AVX512-NEXT: vpextrd $1, %xmm0, %r9d +; AVX512-NEXT: movw %r9w, 15(%rdi) +; AVX512-NEXT: vmovd %xmm0, %r8d +; AVX512-NEXT: movw %r8w, 12(%rdi) ; AVX512-NEXT: shrl $16, %esi -; AVX512-NEXT: movb %sil, 23(%rdi) +; AVX512-NEXT: movb %sil, 11(%rdi) ; AVX512-NEXT: shrl $16, %edx -; AVX512-NEXT: movb %dl, 20(%rdi) +; AVX512-NEXT: movb %dl, 8(%rdi) ; AVX512-NEXT: shrl $16, %ecx -; AVX512-NEXT: movb %cl, 17(%rdi) +; AVX512-NEXT: movb %cl, 5(%rdi) ; AVX512-NEXT: shrl $16, %eax -; AVX512-NEXT: movb %al, 14(%rdi) +; AVX512-NEXT: movb %al, 2(%rdi) +; AVX512-NEXT: shrl $16, %r11d +; AVX512-NEXT: movb %r11b, 23(%rdi) +; AVX512-NEXT: shrl $16, %r10d +; AVX512-NEXT: movb %r10b, 20(%rdi) +; AVX512-NEXT: shrl $16, %r9d +; AVX512-NEXT: movb %r9b, 17(%rdi) +; AVX512-NEXT: shrl $16, %r8d +; AVX512-NEXT: movb %r8b, 14(%rdi) ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r14 ; AVX512-NEXT: popq %r15 @@ -6166,72 +6166,72 @@ ; SKX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; SKX-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; SKX-NEXT: vpextrd $3, %xmm1, %ecx -; SKX-NEXT: movw %cx, 45(%rdi) -; SKX-NEXT: vpextrd $2, %xmm1, %eax -; SKX-NEXT: movw %ax, 42(%rdi) +; SKX-NEXT: vpextrd $3, %xmm1, %r15d +; SKX-NEXT: movw %r15w, 45(%rdi) +; SKX-NEXT: vpextrd $2, %xmm1, %r14d +; SKX-NEXT: movw %r14w, 42(%rdi) ; SKX-NEXT: vpextrd $1, %xmm1, %ebp ; SKX-NEXT: movw %bp, 39(%rdi) -; SKX-NEXT: vmovd %xmm1, %esi -; SKX-NEXT: movw %si, 36(%rdi) +; SKX-NEXT: vmovd %xmm1, %r11d +; SKX-NEXT: movw %r11w, 36(%rdi) ; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; SKX-NEXT: vpextrd $3, %xmm1, %ebx ; SKX-NEXT: movw %bx, 33(%rdi) -; SKX-NEXT: vpextrd $2, 
%xmm1, %edx -; SKX-NEXT: movw %dx, 30(%rdi) -; SKX-NEXT: vpextrd $1, %xmm1, %r15d -; SKX-NEXT: movw %r15w, 27(%rdi) -; SKX-NEXT: vmovd %xmm1, %r14d -; SKX-NEXT: vpextrd $3, %xmm0, %r11d -; SKX-NEXT: movw %r14w, 24(%rdi) -; SKX-NEXT: movw %r11w, 9(%rdi) -; SKX-NEXT: vpextrd $2, %xmm0, %r10d -; SKX-NEXT: vpextrd $1, %xmm0, %r9d -; SKX-NEXT: movw %r10w, 6(%rdi) -; SKX-NEXT: movw %r9w, 3(%rdi) -; SKX-NEXT: vmovd %xmm0, %r8d -; SKX-NEXT: movw %r8w, (%rdi) -; SKX-NEXT: shrl $16, %ecx -; SKX-NEXT: movb %cl, 47(%rdi) -; SKX-NEXT: shrl $16, %eax -; SKX-NEXT: movb %al, 44(%rdi) -; SKX-NEXT: shrl $16, %ebp -; SKX-NEXT: movb %bpl, 41(%rdi) -; SKX-NEXT: shrl $16, %esi -; SKX-NEXT: movb %sil, 38(%rdi) -; SKX-NEXT: shrl $16, %ebx -; SKX-NEXT: movb %bl, 35(%rdi) -; SKX-NEXT: shrl $16, %edx -; SKX-NEXT: movb %dl, 32(%rdi) -; SKX-NEXT: shrl $16, %r15d -; SKX-NEXT: movb %r15b, 29(%rdi) -; SKX-NEXT: shrl $16, %r14d -; SKX-NEXT: movb %r14b, 26(%rdi) -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 +; SKX-NEXT: vpextrd $2, %xmm1, %r10d +; SKX-NEXT: movw %r10w, 30(%rdi) +; SKX-NEXT: vpextrd $1, %xmm1, %r9d +; SKX-NEXT: movw %r9w, 27(%rdi) +; SKX-NEXT: vmovd %xmm1, %r8d ; SKX-NEXT: vpextrd $3, %xmm0, %esi -; SKX-NEXT: movw %si, 21(%rdi) +; SKX-NEXT: movw %r8w, 24(%rdi) +; SKX-NEXT: movw %si, 9(%rdi) ; SKX-NEXT: vpextrd $2, %xmm0, %edx -; SKX-NEXT: movw %dx, 18(%rdi) ; SKX-NEXT: vpextrd $1, %xmm0, %ecx -; SKX-NEXT: movw %cx, 15(%rdi) +; SKX-NEXT: movw %dx, 6(%rdi) +; SKX-NEXT: movw %cx, 3(%rdi) ; SKX-NEXT: vmovd %xmm0, %eax -; SKX-NEXT: movw %ax, 12(%rdi) +; SKX-NEXT: movw %ax, (%rdi) +; SKX-NEXT: shrl $16, %r15d +; SKX-NEXT: movb %r15b, 47(%rdi) +; SKX-NEXT: shrl $16, %r14d +; SKX-NEXT: movb %r14b, 44(%rdi) +; SKX-NEXT: shrl $16, %ebp +; SKX-NEXT: movb %bpl, 41(%rdi) ; SKX-NEXT: shrl $16, %r11d -; SKX-NEXT: movb %r11b, 11(%rdi) +; SKX-NEXT: movb %r11b, 38(%rdi) +; SKX-NEXT: shrl $16, %ebx +; SKX-NEXT: movb %bl, 35(%rdi) ; SKX-NEXT: shrl $16, %r10d -; SKX-NEXT: movb %r10b, 8(%rdi) +; SKX-NEXT: movb %r10b, 32(%rdi) ; SKX-NEXT: shrl $16, %r9d -; SKX-NEXT: movb %r9b, 5(%rdi) +; SKX-NEXT: movb %r9b, 29(%rdi) ; SKX-NEXT: shrl $16, %r8d -; SKX-NEXT: movb %r8b, 2(%rdi) +; SKX-NEXT: movb %r8b, 26(%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 +; SKX-NEXT: vpextrd $3, %xmm0, %r11d +; SKX-NEXT: movw %r11w, 21(%rdi) +; SKX-NEXT: vpextrd $2, %xmm0, %r10d +; SKX-NEXT: movw %r10w, 18(%rdi) +; SKX-NEXT: vpextrd $1, %xmm0, %r9d +; SKX-NEXT: movw %r9w, 15(%rdi) +; SKX-NEXT: vmovd %xmm0, %r8d +; SKX-NEXT: movw %r8w, 12(%rdi) ; SKX-NEXT: shrl $16, %esi -; SKX-NEXT: movb %sil, 23(%rdi) +; SKX-NEXT: movb %sil, 11(%rdi) ; SKX-NEXT: shrl $16, %edx -; SKX-NEXT: movb %dl, 20(%rdi) +; SKX-NEXT: movb %dl, 8(%rdi) ; SKX-NEXT: shrl $16, %ecx -; SKX-NEXT: movb %cl, 17(%rdi) +; SKX-NEXT: movb %cl, 5(%rdi) ; SKX-NEXT: shrl $16, %eax -; SKX-NEXT: movb %al, 14(%rdi) +; SKX-NEXT: movb %al, 2(%rdi) +; SKX-NEXT: shrl $16, %r11d +; SKX-NEXT: movb %r11b, 23(%rdi) +; SKX-NEXT: shrl $16, %r10d +; SKX-NEXT: movb %r10b, 20(%rdi) +; SKX-NEXT: shrl $16, %r9d +; SKX-NEXT: movb %r9b, 17(%rdi) +; SKX-NEXT: shrl $16, %r8d +; SKX-NEXT: movb %r8b, 14(%rdi) ; SKX-NEXT: popq %rbx ; SKX-NEXT: popq %r14 ; SKX-NEXT: popq %r15 diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -433,184 +433,184 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; SSE2-LABEL: trunc_usat_v8i64_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa 
(%rdi), %xmm9 +; SSE2-NEXT: movdqa (%rdi), %xmm2 ; SSE2-NEXT: movdqa 16(%rdi), %xmm5 ; SSE2-NEXT: movdqa 32(%rdi), %xmm6 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm8 +; SSE2-NEXT: por %xmm1, %xmm8 ; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: 
pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm7 +; SSE2-NEXT: por %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm9, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v8i64_v8i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm9 +; SSSE3-NEXT: movdqa (%rdi), %xmm2 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm5 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm6 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSSE3-NEXT: pxor %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSSE3-NEXT: movdqa %xmm4, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm1 +; SSSE3-NEXT: pandn %xmm3, %xmm8 +; SSSE3-NEXT: por %xmm1, %xmm8 ; SSSE3-NEXT: movdqa %xmm6, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm7 +; SSSE3-NEXT: movdqa %xmm4, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSSE3-NEXT: pand %xmm7, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: pandn %xmm3, %xmm1 ; SSSE3-NEXT: por %xmm6, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSSE3-NEXT: movdqa %xmm5, %xmm0 -; 
SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm7 +; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2] +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: pandn %xmm3, %xmm7 +; SSSE3-NEXT: por %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pand %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm9 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm9, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pandn %xmm3, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm9 +; SSE41-NEXT: movdqa (%rdi), %xmm3 +; SSE41-NEXT: movdqa 16(%rdi), %xmm6 ; SSE41-NEXT: movdqa 32(%rdi), %xmm7 ; SSE41-NEXT: movdqa 48(%rdi), %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm4, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: movdqa 
%xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm2, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] -; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4 -; SSE41-NEXT: pxor %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1329,66 +1329,66 @@ ; SSE2-LABEL: trunc_usat_v8i64_v8i16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm6 ; SSE2-NEXT: movdqa 48(%rdi), %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: 
movdqa %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm6 ; SSE2-NEXT: por %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pxor %xmm3, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm8 +; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] @@ -1402,66 +1402,66 @@ ; SSSE3-LABEL: trunc_usat_v8i64_v8i16: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa (%rdi), %xmm4 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm9 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm6 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = 
[9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002324991,9223372039002324991] -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: pandn %xmm1, %xmm2 ; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm7, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm6 +; SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm8 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm6 ; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm6 +; SSSE3-NEXT: pandn %xmm1, %xmm6 ; SSSE3-NEXT: por %xmm7, %xmm6 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm9 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pxor %xmm3, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm8 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm7, 
%xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: pandn %xmm1, %xmm8 +; SSSE3-NEXT: por %xmm4, %xmm8 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pandn %xmm1, %xmm4 +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] @@ -1476,54 +1476,54 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm7 ; SSE41-NEXT: movdqa 16(%rdi), %xmm1 -; SSE41-NEXT: movdqa 32(%rdi), %xmm8 -; SSE41-NEXT: movdqa 48(%rdi), %xmm9 +; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm4, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm2, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: packusdw %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: packusdw %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4 -; SSE41-NEXT: pxor %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm8 +; SSE41-NEXT: 
pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm4, %xmm2 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm7, %xmm2 ; SSE41-NEXT: packusdw %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -1882,9 +1882,9 @@ ; SSE2-LABEL: trunc_usat_v16i32_v16i16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm8 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa 32(%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm4 +; SSE2-NEXT: movdqa 48(%rdi), %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm6, %xmm3 @@ -1895,24 +1895,24 @@ ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm8, %xmm0 ; SSE2-NEXT: pxor %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm8 ; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pxor %xmm6, %xmm8 ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm7, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 ; SSE2-NEXT: pxor %xmm2, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm2 ; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 @@ -1929,9 +1929,9 @@ ; SSSE3-LABEL: trunc_usat_v16i32_v16i16: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa (%rdi), %xmm5 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm8 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm4 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm0 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm4 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm8 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm6, %xmm3 @@ -1942,24 +1942,24 @@ ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: pxor %xmm7, %xmm1 ; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm8, %xmm0 ; SSSE3-NEXT: pxor %xmm6, %xmm0 ; SSSE3-NEXT: movdqa %xmm2, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm3, %xmm8 ; SSSE3-NEXT: pxor %xmm7, %xmm3 -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm6, %xmm4 +; SSSE3-NEXT: por %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pxor %xmm6, %xmm8 ; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm5 ; SSSE3-NEXT: pxor %xmm7, %xmm0 ; 
SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 ; SSSE3-NEXT: pxor %xmm2, %xmm7 -; SSSE3-NEXT: pand %xmm8, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm7, %xmm2 ; SSSE3-NEXT: pslld $16, %xmm2 ; SSSE3-NEXT: psrad $16, %xmm2 @@ -2422,7 +2422,7 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE2-LABEL: trunc_usat_v4i64_v4i8_store: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm3 @@ -2431,28 +2431,28 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] ; SSE2-NEXT: movdqa %xmm6, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pand %xmm5, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: packuswb %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: packuswb %xmm4, %xmm3 ; SSE2-NEXT: packuswb %xmm3, %xmm3 ; SSE2-NEXT: packuswb %xmm3, %xmm3 ; SSE2-NEXT: movd %xmm3, (%rdi) @@ -2460,7 +2460,7 @@ ; ; SSSE3-LABEL: trunc_usat_v4i64_v4i8_store: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255] ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm3 @@ -2469,29 +2469,29 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] ; SSSE3-NEXT: movdqa %xmm6, %xmm7 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pand %xmm5, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: por %xmm8, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm2, %xmm3 ; SSSE3-NEXT: por %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = 
xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm4 ; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSSE3-NEXT: movd %xmm3, (%rdi) ; SSSE3-NEXT: retq ; @@ -2608,65 +2608,65 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm6 ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm9 +; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pxor %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm8 +; SSE2-NEXT: por %xmm0, %xmm8 ; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm0 ; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: 
pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm9 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm9, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm7 +; SSE2-NEXT: por %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: packuswb %xmm7, %xmm4 +; SSE2-NEXT: packuswb %xmm4, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; @@ -2674,65 +2674,65 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa (%rdi), %xmm6 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm9 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm1 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm0, %xmm7 -; SSSE3-NEXT: pxor %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259711,9223372039002259711] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm4, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm8 +; SSSE3-NEXT: por %xmm0, %xmm8 ; SSSE3-NEXT: movdqa %xmm6, %xmm0 -; SSSE3-NEXT: pxor 
%xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm7 +; SSSE3-NEXT: movdqa %xmm4, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSSE3-NEXT: pand %xmm7, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm0 ; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm9 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm9, %xmm1 -; SSSE3-NEXT: packuswb %xmm3, %xmm1 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm7 +; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2] +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: pandn %xmm2, %xmm7 +; SSSE3-NEXT: por %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: packuswb %xmm7, %xmm4 +; SSSE3-NEXT: packuswb %xmm4, %xmm0 ; SSSE3-NEXT: packuswb %xmm0, %xmm0 ; SSSE3-NEXT: retq ; @@ -2740,54 +2740,54 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm7 ; SSE41-NEXT: movdqa 16(%rdi), %xmm1 -; SSE41-NEXT: movdqa 32(%rdi), %xmm8 -; SSE41-NEXT: movdqa 48(%rdi), %xmm9 +; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = 
[9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm4, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm2, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: packusdw %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: packusdw %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4 -; SSE41-NEXT: pxor %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm4, %xmm2 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm7, %xmm2 ; SSE41-NEXT: packusdw %xmm2, %xmm1 ; SSE41-NEXT: packuswb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -2866,65 +2866,65 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm6 ; SSE2-NEXT: movdqa 16(%rdi), %xmm5 -; SSE2-NEXT: movdqa 32(%rdi), %xmm9 +; SSE2-NEXT: movdqa 32(%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm5, %xmm7 ; SSE2-NEXT: pxor %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = 
[9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm3, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm8 +; SSE2-NEXT: por %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pxor %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm5 ; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: packuswb %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm9, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm5 +; SSE2-NEXT: packuswb %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2] +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm7 +; SSE2-NEXT: por %xmm4, %xmm7 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = 
xmm3[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: packuswb %xmm7, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm5 ; SSE2-NEXT: packuswb %xmm5, %xmm5 ; SSE2-NEXT: movq %xmm5, (%rsi) ; SSE2-NEXT: retq @@ -2933,65 +2933,65 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa (%rdi), %xmm6 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm5 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm9 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm0 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm5, %xmm7 ; SSSE3-NEXT: pxor %xmm2, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259711,9223372039002259711] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm6, %xmm3 -; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm0, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm3, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pandn %xmm1, %xmm8 +; SSSE3-NEXT: por %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm6, %xmm5 +; SSSE3-NEXT: pxor %xmm2, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSSE3-NEXT: pand %xmm7, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: pandn %xmm1, %xmm5 ; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: packuswb %xmm1, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm9 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm9, %xmm0 -; SSSE3-NEXT: packuswb %xmm3, %xmm0 -; SSSE3-NEXT: packuswb %xmm0, %xmm5 +; SSSE3-NEXT: packuswb %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,0,2,2] +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pandn %xmm1, %xmm7 +; SSSE3-NEXT: por %xmm4, %xmm7 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm1, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: packuswb %xmm7, %xmm3 +; SSSE3-NEXT: packuswb %xmm3, %xmm5 ; SSSE3-NEXT: packuswb %xmm5, %xmm5 ; SSSE3-NEXT: movq %xmm5, (%rsi) ; SSSE3-NEXT: retq @@ -3000,54 +3000,54 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm7 ; SSE41-NEXT: movdqa 16(%rdi), %xmm6 -; SSE41-NEXT: movdqa 32(%rdi), %xmm8 -; SSE41-NEXT: movdqa 48(%rdi), %xmm9 +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm5 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm3, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm3, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm3, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 -; SSE41-NEXT: packusdw %xmm3, %xmm6 -; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: packusdw %xmm8, %xmm6 +; SSE41-NEXT: movdqa %xmm5, 
%xmm0 ; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3 -; SSE41-NEXT: pxor %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm3, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: packusdw %xmm3, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: packusdw %xmm7, %xmm1 ; SSE41-NEXT: packusdw %xmm1, %xmm6 ; SSE41-NEXT: packuswb %xmm6, %xmm6 ; SSE41-NEXT: movq %xmm6, (%rsi) @@ -3127,359 +3127,359 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE2-LABEL: trunc_usat_v16i64_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa 96(%rdi), %xmm9 -; SSE2-NEXT: movdqa 112(%rdi), %xmm10 -; SSE2-NEXT: movdqa 64(%rdi), %xmm11 -; SSE2-NEXT: movdqa 80(%rdi), %xmm12 -; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: movdqa 16(%rdi), %xmm6 -; SSE2-NEXT: movdqa 32(%rdi), %xmm13 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm14, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm14, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm6 -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: movdqa 96(%rdi), %xmm1 +; SSE2-NEXT: movdqa 112(%rdi), %xmm3 +; SSE2-NEXT: movdqa 64(%rdi), %xmm6 +; SSE2-NEXT: movdqa 80(%rdi), %xmm7 +; SSE2-NEXT: movdqa (%rdi), %xmm10 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm8 +; SSE2-NEXT: movdqa 48(%rdi), %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: pxor %xmm4, 
%xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm5, %xmm13 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,0,2,2] +; SSE2-NEXT: pand %xmm12, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm12 +; SSE2-NEXT: por %xmm0, %xmm12 +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm11 +; SSE2-NEXT: movdqa %xmm5, %xmm13 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm10 +; SSE2-NEXT: pandn %xmm2, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 +; SSE2-NEXT: packuswb %xmm12, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pxor %xmm4, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm11 +; SSE2-NEXT: movdqa %xmm5, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm9 +; SSE2-NEXT: pandn %xmm2, %xmm11 +; SSE2-NEXT: por %xmm9, %xmm11 +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pxor %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm10 +; SSE2-NEXT: movdqa %xmm5, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm8 +; SSE2-NEXT: pandn %xmm2, %xmm10 +; SSE2-NEXT: por %xmm8, %xmm10 +; SSE2-NEXT: packuswb %xmm11, %xmm10 +; SSE2-NEXT: packuswb %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pxor %xmm4, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm9, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm7 +; SSE2-NEXT: pandn %xmm2, %xmm9 +; SSE2-NEXT: por %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pxor %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm7 +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: packuswb %xmm9, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; 
SSE2-NEXT: pandn %xmm2, %xmm8 +; SSE2-NEXT: por %xmm3, %xmm8 ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm14, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm13, %xmm1 -; SSE2-NEXT: pxor %xmm14, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm13 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm13, %xmm3 -; SSE2-NEXT: packuswb %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: pxor %xmm14, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm12 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm12, %xmm3 -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: pxor %xmm14, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm11 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm11, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm10, %xmm3 -; SSE2-NEXT: pxor %xmm14, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm10 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm10, %xmm4 -; SSE2-NEXT: movdqa %xmm9, %xmm3 -; SSE2-NEXT: pxor %xmm14, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm14, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm9 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm9, %xmm2 -; SSE2-NEXT: packuswb %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm8, %xmm4 +; SSE2-NEXT: packuswb %xmm4, %xmm7 +; SSE2-NEXT: packuswb %xmm7, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v16i64_v16i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa 96(%rdi), 
%xmm9 -; SSSE3-NEXT: movdqa 112(%rdi), %xmm10 -; SSSE3-NEXT: movdqa 64(%rdi), %xmm11 -; SSSE3-NEXT: movdqa 80(%rdi), %xmm12 -; SSSE3-NEXT: movdqa (%rdi), %xmm3 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm6 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm13 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm14 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm6, %xmm0 -; SSSE3-NEXT: pxor %xmm14, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm14, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711] -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm14, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm14, %xmm6 -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: packuswb %xmm5, %xmm0 +; SSSE3-NEXT: movdqa 96(%rdi), %xmm1 +; SSSE3-NEXT: movdqa 112(%rdi), %xmm3 +; SSSE3-NEXT: movdqa 64(%rdi), %xmm6 +; SSSE3-NEXT: movdqa 80(%rdi), %xmm7 +; SSSE3-NEXT: movdqa (%rdi), %xmm10 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm8 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm0, %xmm11 +; SSSE3-NEXT: pxor %xmm4, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm12 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm5, %xmm13 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm13 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,0,2,2] +; SSSE3-NEXT: pand %xmm12, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm12 +; SSSE3-NEXT: pand %xmm12, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm12 +; SSSE3-NEXT: por %xmm0, %xmm12 +; SSSE3-NEXT: movdqa %xmm10, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm11 +; SSSE3-NEXT: movdqa %xmm5, %xmm13 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm13 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] +; SSSE3-NEXT: pand %xmm11, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm10 +; SSSE3-NEXT: pandn %xmm2, %xmm0 +; SSSE3-NEXT: por %xmm10, %xmm0 +; SSSE3-NEXT: packuswb %xmm12, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm10 +; SSSE3-NEXT: pxor %xmm4, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm11 +; SSSE3-NEXT: movdqa %xmm5, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,0,2,2] +; SSSE3-NEXT: pand %xmm11, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm11 +; SSSE3-NEXT: pand %xmm11, %xmm9 +; SSSE3-NEXT: pandn 
%xmm2, %xmm11 +; SSSE3-NEXT: por %xmm9, %xmm11 +; SSSE3-NEXT: movdqa %xmm8, %xmm9 +; SSSE3-NEXT: pxor %xmm4, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm10 +; SSSE3-NEXT: movdqa %xmm5, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[0,0,2,2] +; SSSE3-NEXT: pand %xmm10, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm10, %xmm8 +; SSSE3-NEXT: pandn %xmm2, %xmm10 +; SSSE3-NEXT: por %xmm8, %xmm10 +; SSSE3-NEXT: packuswb %xmm11, %xmm10 +; SSSE3-NEXT: packuswb %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pxor %xmm4, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm9, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm7 +; SSSE3-NEXT: pandn %xmm2, %xmm9 +; SSSE3-NEXT: por %xmm7, %xmm9 +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: pxor %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm8 +; SSSE3-NEXT: movdqa %xmm5, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: pandn %xmm2, %xmm7 +; SSSE3-NEXT: por %xmm6, %xmm7 +; SSSE3-NEXT: packuswb %xmm9, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm8 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,2,2] +; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm2, %xmm8 +; SSSE3-NEXT: por %xmm3, %xmm8 ; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pxor %xmm14, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm14, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm6, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; SSSE3-NEXT: por %xmm3, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: pandn %xmm2, %xmm4 ; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm13, %xmm1 -; SSSE3-NEXT: pxor %xmm14, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm14, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm13 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm13, %xmm3 -; SSSE3-NEXT: packuswb %xmm4, %xmm3 -; SSSE3-NEXT: packuswb %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm12, %xmm1 -; SSSE3-NEXT: pxor %xmm14, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: 
pcmpeqd %xmm14, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm12 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm12, %xmm3 -; SSSE3-NEXT: movdqa %xmm11, %xmm1 -; SSSE3-NEXT: pxor %xmm14, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm14, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm11 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm11, %xmm1 -; SSSE3-NEXT: packuswb %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm10, %xmm3 -; SSSE3-NEXT: pxor %xmm14, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm14, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm10 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm10, %xmm4 -; SSSE3-NEXT: movdqa %xmm9, %xmm3 -; SSSE3-NEXT: pxor %xmm14, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm14, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm9 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm9, %xmm2 -; SSSE3-NEXT: packuswb %xmm4, %xmm2 -; SSSE3-NEXT: packuswb %xmm2, %xmm1 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm8, %xmm4 +; SSSE3-NEXT: packuswb %xmm4, %xmm7 +; SSSE3-NEXT: packuswb %xmm7, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v16i64_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa 96(%rdi), %xmm8 -; SSE41-NEXT: movdqa 112(%rdi), %xmm9 -; SSE41-NEXT: movdqa 64(%rdi), %xmm10 -; SSE41-NEXT: movdqa 80(%rdi), %xmm11 -; SSE41-NEXT: movdqa (%rdi), %xmm2 +; SSE41-NEXT: movdqa 96(%rdi), %xmm2 +; SSE41-NEXT: movdqa 112(%rdi), %xmm4 +; SSE41-NEXT: movdqa 64(%rdi), %xmm7 +; SSE41-NEXT: movdqa 80(%rdi), %xmm8 +; SSE41-NEXT: movdqa (%rdi), %xmm11 ; SSE41-NEXT: movdqa 16(%rdi), %xmm1 -; SSE41-NEXT: movdqa 32(%rdi), %xmm12 -; SSE41-NEXT: movdqa 48(%rdi), %xmm13 +; SSE41-NEXT: movdqa 32(%rdi), %xmm9 +; SSE41-NEXT: movdqa 48(%rdi), %xmm10 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm5, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 +; SSE41-NEXT: 
movdqa %xmm5, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm12 +; SSE41-NEXT: movdqa %xmm11, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm5, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: por %xmm13, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: packusdw %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm13, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 +; SSE41-NEXT: packusdw %xmm12, %xmm1 +; SSE41-NEXT: movdqa %xmm10, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm2 -; SSE41-NEXT: movdqa %xmm12, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 +; SSE41-NEXT: movdqa %xmm5, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11 +; SSE41-NEXT: movdqa %xmm9, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 -; SSE41-NEXT: packusdw %xmm2, %xmm5 -; SSE41-NEXT: packusdw %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: movdqa %xmm5, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm10 +; SSE41-NEXT: packusdw %xmm11, %xmm10 +; SSE41-NEXT: packusdw %xmm10, %xmm1 +; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 -; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 +; SSE41-NEXT: movdqa %xmm5, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 +; SSE41-NEXT: movdqa 
%xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 -; SSE41-NEXT: packusdw %xmm5, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE41-NEXT: movdqa %xmm5, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: packusdw %xmm9, %xmm8 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 -; SSE41-NEXT: pxor %xmm8, %xmm6 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: packusdw %xmm5, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: packuswb %xmm2, %xmm1 +; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 +; SSE41-NEXT: pxor %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm7, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm8 +; SSE41-NEXT: packuswb %xmm8, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -4004,38 +4004,38 @@ ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm4, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm8 +; SSE2-NEXT: por %xmm0, %xmm8 ; SSE2-NEXT: movdqa %xmm6, %xmm7 ; SSE2-NEXT: pxor %xmm4, %xmm7 -; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: 
pandn %xmm3, %xmm0 ; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm6 -; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: packuswb %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm7 +; SSE2-NEXT: por %xmm5, %xmm7 ; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: packuswb %xmm6, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm7, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v16i32_v16i8: @@ -4044,38 +4044,38 @@ ; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm1 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255] ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm7 ; SSSE3-NEXT: pxor %xmm4, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm2, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pandn %xmm3, %xmm8 +; SSSE3-NEXT: por %xmm0, %xmm8 ; SSSE3-NEXT: movdqa %xmm6, %xmm7 ; SSSE3-NEXT: pxor %xmm4, %xmm7 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: pandn %xmm3, %xmm0 ; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: packuswb %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: pandn %xmm3, %xmm7 +; SSSE3-NEXT: por %xmm5, %xmm7 ; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: packuswb %xmm6, %xmm3 -; SSSE3-NEXT: packuswb %xmm3, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pandn %xmm3, %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: packuswb %xmm7, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v16i32_v16i8: @@ -4144,38 +4144,38 @@ ; SSE2-NEXT: movdqa 16(%rdi), %xmm5 ; SSE2-NEXT: movdqa 32(%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] +; SSE2-NEXT: movdqa 
{{.*#+}} xmm2 = [255,255,255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm5, %xmm7 ; SSE2-NEXT: pxor %xmm3, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm8 +; SSE2-NEXT: por %xmm5, %xmm8 ; SSE2-NEXT: movdqa %xmm6, %xmm7 ; SSE2-NEXT: pxor %xmm3, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm5 ; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: packuswb %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm6 -; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: packuswb %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm7 +; SSE2-NEXT: por %xmm4, %xmm7 ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm5 ; SSE2-NEXT: movdqa %xmm5, (%rsi) ; SSE2-NEXT: retq ; @@ -4185,38 +4185,38 @@ ; SSSE3-NEXT: movdqa 16(%rdi), %xmm5 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm0 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm5, %xmm7 ; SSSE3-NEXT: pxor %xmm3, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pandn %xmm2, %xmm8 +; SSSE3-NEXT: por %xmm5, %xmm8 ; SSSE3-NEXT: movdqa %xmm6, %xmm7 ; SSSE3-NEXT: pxor %xmm3, %xmm7 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: pandn %xmm2, %xmm5 ; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: packuswb %xmm1, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm4, %xmm6 +; SSSE3-NEXT: packuswb %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; 
SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm1, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pandn %xmm2, %xmm7 +; SSSE3-NEXT: por %xmm4, %xmm7 ; SSSE3-NEXT: pxor %xmm0, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: packuswb %xmm6, %xmm2 -; SSSE3-NEXT: packuswb %xmm2, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: packuswb %xmm7, %xmm1 +; SSSE3-NEXT: packuswb %xmm1, %xmm5 ; SSSE3-NEXT: movdqa %xmm5, (%rsi) ; SSSE3-NEXT: retq ; @@ -4643,152 +4643,152 @@ define <32 x i8> @trunc_usat_v32i32_v32i8(ptr %p0) { ; SSE2-LABEL: trunc_usat_v32i32_v32i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm11 -; SSE2-NEXT: movdqa 16(%rdi), %xmm12 -; SSE2-NEXT: movdqa 32(%rdi), %xmm9 -; SSE2-NEXT: movdqa 48(%rdi), %xmm10 -; SSE2-NEXT: movdqa 96(%rdi), %xmm0 -; SSE2-NEXT: movdqa 112(%rdi), %xmm2 -; SSE2-NEXT: movdqa 64(%rdi), %xmm5 -; SSE2-NEXT: movdqa 80(%rdi), %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] +; SSE2-NEXT: movdqa (%rdi), %xmm7 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm5 +; SSE2-NEXT: movdqa 96(%rdi), %xmm8 +; SSE2-NEXT: movdqa 112(%rdi), %xmm9 +; SSE2-NEXT: movdqa 64(%rdi), %xmm10 +; SSE2-NEXT: movdqa 80(%rdi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: pxor %xmm6, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm3, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm12 +; SSE2-NEXT: por %xmm1, %xmm12 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pxor %xmm6, %xmm11 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm10 +; SSE2-NEXT: pandn %xmm4, %xmm1 +; SSE2-NEXT: por %xmm10, %xmm1 +; SSE2-NEXT: packuswb %xmm12, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pxor %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm3, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm11 +; SSE2-NEXT: por %xmm9, %xmm11 +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pxor %xmm6, %xmm9 +; SSE2-NEXT: movdqa %xmm3, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm8 +; SSE2-NEXT: pandn %xmm4, %xmm10 +; SSE2-NEXT: por %xmm8, %xmm10 +; SSE2-NEXT: packuswb %xmm11, %xmm10 +; SSE2-NEXT: packuswb %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pxor %xmm6, %xmm8 +; SSE2-NEXT: movdqa %xmm3, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm9 +; SSE2-NEXT: por %xmm0, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pxor %xmm6, %xmm8 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 +; 
SSE2-NEXT: packuswb %xmm9, %xmm0 ; SSE2-NEXT: movdqa %xmm5, %xmm7 ; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm6, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: packuswb %xmm5, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm12 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm12, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm3 -; SSE2-NEXT: pxor %xmm6, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm11, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm10 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm10, %xmm3 -; SSE2-NEXT: pxor %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm9 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: packuswb %xmm3, %xmm4 -; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm5, %xmm8 +; SSE2-NEXT: pxor %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: packuswb %xmm8, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v32i32_v32i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm11 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm12 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm9 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm10 -; SSSE3-NEXT: movdqa 96(%rdi), %xmm0 -; SSSE3-NEXT: movdqa 112(%rdi), %xmm2 -; SSSE3-NEXT: movdqa 64(%rdi), %xmm5 -; SSSE3-NEXT: movdqa 80(%rdi), %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] +; SSSE3-NEXT: movdqa (%rdi), %xmm7 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm2 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm5 +; SSSE3-NEXT: movdqa 96(%rdi), %xmm8 +; SSSE3-NEXT: movdqa 112(%rdi), %xmm9 +; SSSE3-NEXT: movdqa 64(%rdi), %xmm10 +; SSSE3-NEXT: movdqa 80(%rdi), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255] ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm7, %xmm1 -; SSSE3-NEXT: pxor %xmm6, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm4, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm11 +; SSSE3-NEXT: pxor %xmm6, %xmm11 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = 
[2147483903,2147483903,2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm3, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSSE3-NEXT: pand %xmm12, %xmm1 +; SSSE3-NEXT: pandn %xmm4, %xmm12 +; SSSE3-NEXT: por %xmm1, %xmm12 +; SSSE3-NEXT: movdqa %xmm10, %xmm11 +; SSSE3-NEXT: pxor %xmm6, %xmm11 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm10 +; SSSE3-NEXT: pandn %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm10, %xmm1 +; SSSE3-NEXT: packuswb %xmm12, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm10 +; SSSE3-NEXT: pxor %xmm6, %xmm10 +; SSSE3-NEXT: movdqa %xmm3, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 +; SSSE3-NEXT: pand %xmm11, %xmm9 +; SSSE3-NEXT: pandn %xmm4, %xmm11 +; SSSE3-NEXT: por %xmm9, %xmm11 +; SSSE3-NEXT: movdqa %xmm8, %xmm9 +; SSSE3-NEXT: pxor %xmm6, %xmm9 +; SSSE3-NEXT: movdqa %xmm3, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm10, %xmm8 +; SSSE3-NEXT: pandn %xmm4, %xmm10 +; SSSE3-NEXT: por %xmm8, %xmm10 +; SSSE3-NEXT: packuswb %xmm11, %xmm10 +; SSSE3-NEXT: packuswb %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm8 +; SSSE3-NEXT: pxor %xmm6, %xmm8 +; SSSE3-NEXT: movdqa %xmm3, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm9 +; SSSE3-NEXT: por %xmm0, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pxor %xmm6, %xmm8 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm7 +; SSSE3-NEXT: pandn %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: packuswb %xmm9, %xmm0 ; SSSE3-NEXT: movdqa %xmm5, %xmm7 ; SSSE3-NEXT: pxor %xmm6, %xmm7 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: packuswb %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pxor %xmm6, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: packuswb %xmm5, %xmm3 -; SSSE3-NEXT: packuswb %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm12, %xmm0 -; SSSE3-NEXT: pxor %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm12 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm12, %xmm2 -; SSSE3-NEXT: movdqa %xmm11, %xmm3 -; SSSE3-NEXT: pxor %xmm6, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm11 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm11, %xmm0 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm10, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm10 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm10, %xmm3 -; SSSE3-NEXT: pxor %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm9 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm9, %xmm4 -; SSSE3-NEXT: packuswb %xmm3, %xmm4 -; SSSE3-NEXT: packuswb %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pandn %xmm4, %xmm8 +; 
SSSE3-NEXT: por %xmm5, %xmm8 +; SSSE3-NEXT: pxor %xmm2, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: packuswb %xmm8, %xmm3 +; SSSE3-NEXT: packuswb %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v32i32_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -2148,8 +2148,8 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE2-NEXT: movdqa %xmm0, %xmm5 @@ -2161,17 +2161,17 @@ ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: movdqa %xmm1, 112(%rdi) -; SSE2-NEXT: movdqa %xmm4, 96(%rdi) +; SSE2-NEXT: movdqa %xmm8, 96(%rdi) ; SSE2-NEXT: movdqa %xmm6, 80(%rdi) ; SSE2-NEXT: movdqa %xmm7, 64(%rdi) ; SSE2-NEXT: movdqa %xmm0, 48(%rdi) ; SSE2-NEXT: movdqa %xmm5, 32(%rdi) ; SSE2-NEXT: movdqa %xmm3, 16(%rdi) -; SSE2-NEXT: movdqa %xmm8, (%rdi) +; SSE2-NEXT: movdqa %xmm4, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_32i8_to_32i32: @@ -2180,8 +2180,8 @@ ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm3, %xmm8 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSSE3-NEXT: movdqa %xmm0, %xmm5 @@ -2193,17 +2193,17 @@ ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; SSSE3-NEXT: 
punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSSE3-NEXT: movdqa %xmm1, 112(%rdi) -; SSSE3-NEXT: movdqa %xmm4, 96(%rdi) +; SSSE3-NEXT: movdqa %xmm8, 96(%rdi) ; SSSE3-NEXT: movdqa %xmm6, 80(%rdi) ; SSSE3-NEXT: movdqa %xmm7, 64(%rdi) ; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) ; SSSE3-NEXT: movdqa %xmm5, 32(%rdi) ; SSSE3-NEXT: movdqa %xmm3, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm8, (%rdi) +; SSSE3-NEXT: movdqa %xmm4, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_32i8_to_32i32: diff --git a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll --- a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll +++ b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll @@ -66,11 +66,10 @@ ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rbp ; X64-NEXT: movq %rsp, %rbp -; X64-NEXT: pushq %r14 ; X64-NEXT: pushq %rbx ; X64-NEXT: andq $-64, %rsp ; X64-NEXT: subq $64, %rsp -; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movq %rdi, %rbx ; X64-NEXT: vmovaps 16(%rbp), %zmm8 ; X64-NEXT: vp2intersectd %zmm1, %zmm0, %k0 ; X64-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill @@ -104,16 +103,15 @@ ; X64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; X64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; X64-NEXT: kmovw %k0, %edi -; X64-NEXT: kmovw %k1, %ebx +; X64-NEXT: kmovw %k1, %r8d ; X64-NEXT: addl %edi, %eax ; X64-NEXT: addl %ecx, %edx -; X64-NEXT: addl %ebx, %eax +; X64-NEXT: addl %r8d, %eax ; X64-NEXT: addl %esi, %eax ; X64-NEXT: addl %edx, %eax -; X64-NEXT: movw %ax, (%r14) -; X64-NEXT: leaq -16(%rbp), %rsp +; X64-NEXT: movw %ax, (%rbx) +; X64-NEXT: leaq -8(%rbp), %rsp ; X64-NEXT: popq %rbx -; X64-NEXT: popq %r14 ; X64-NEXT: popq %rbp ; X64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/vselect-minmax.ll b/llvm/test/CodeGen/X86/vselect-minmax.ll --- a/llvm/test/CodeGen/X86/vselect-minmax.ll +++ b/llvm/test/CodeGen/X86/vselect-minmax.ll @@ -4196,14 +4196,14 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm6, %xmm5 @@ -4271,14 +4271,14 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, 
%xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm6, %xmm5 @@ -4346,14 +4346,14 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -4421,14 +4421,14 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -4503,21 +4503,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm6, %xmm5 @@ -4551,24 +4551,23 @@ ; ; SSE4-LABEL: test121: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm0, %xmm7 +; SSE4-NEXT: movdqa %xmm0, %xmm8 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa 
%xmm6, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 -; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 ; SSE4-NEXT: movapd %xmm6, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test121: @@ -4624,21 +4623,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm6, %xmm5 @@ -4672,24 +4671,23 @@ ; ; SSE4-LABEL: test122: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm0, %xmm7 +; SSE4-NEXT: movdqa %xmm0, %xmm8 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm6, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 -; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 ; SSE4-NEXT: movapd %xmm6, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test122: @@ -4745,21 +4743,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; 
SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -4793,10 +4791,9 @@ ; ; SSE4-LABEL: test123: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm0, %xmm7 +; SSE4-NEXT: movdqa %xmm0, %xmm8 ; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm5, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 @@ -4804,12 +4801,12 @@ ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm3, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 ; SSE4-NEXT: movapd %xmm6, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test123: @@ -4865,21 +4862,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -4913,10 +4910,9 @@ ; ; SSE4-LABEL: test124: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm0, %xmm7 +; SSE4-NEXT: movdqa %xmm0, %xmm8 ; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm5, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 @@ -4924,12 +4920,12 @@ ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm3, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 ; SSE4-NEXT: movapd %xmm6, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test124: @@ -4985,21 +4981,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, 
%xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm6, %xmm5 @@ -5033,39 +5029,36 @@ ; ; SSE4-LABEL: test125: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm6, %xmm9 -; SSE4-NEXT: movdqa %xmm5, %xmm10 -; SSE4-NEXT: movdqa %xmm0, %xmm5 -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm0, %xmm6 -; SSE4-NEXT: pxor %xmm7, %xmm6 +; SSE4-NEXT: movdqa %xmm0, %xmm9 +; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm0, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm10 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE4-NEXT: movdqa %xmm1, %xmm5 -; SSE4-NEXT: pxor %xmm7, %xmm5 -; SSE4-NEXT: movdqa %xmm10, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm5, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 +; SSE4-NEXT: movdqa %xmm1, %xmm9 +; SSE4-NEXT: pxor %xmm8, %xmm9 +; SSE4-NEXT: movdqa %xmm5, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm2, %xmm1 -; SSE4-NEXT: pxor %xmm7, %xmm1 -; SSE4-NEXT: movdqa %xmm9, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm1 +; SSE4-NEXT: movdqa %xmm6, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm3, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm7 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm7 -; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm7, %xmm8 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 +; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 -; SSE4-NEXT: movapd %xmm10, %xmm1 -; SSE4-NEXT: movapd %xmm9, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: movapd %xmm5, %xmm1 +; SSE4-NEXT: movapd %xmm6, %xmm2 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test125: @@ -5135,21 +5128,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: 
movdqa %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm6, %xmm5 @@ -5183,39 +5176,36 @@ ; ; SSE4-LABEL: test126: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm6, %xmm9 -; SSE4-NEXT: movdqa %xmm5, %xmm10 -; SSE4-NEXT: movdqa %xmm0, %xmm5 -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm0, %xmm6 -; SSE4-NEXT: pxor %xmm7, %xmm6 +; SSE4-NEXT: movdqa %xmm0, %xmm9 +; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm0, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm10 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE4-NEXT: movdqa %xmm1, %xmm5 -; SSE4-NEXT: pxor %xmm7, %xmm5 -; SSE4-NEXT: movdqa %xmm10, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm5, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 +; SSE4-NEXT: movdqa %xmm1, %xmm9 +; SSE4-NEXT: pxor %xmm8, %xmm9 +; SSE4-NEXT: movdqa %xmm5, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm2, %xmm1 -; SSE4-NEXT: pxor %xmm7, %xmm1 -; SSE4-NEXT: movdqa %xmm9, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm1 +; SSE4-NEXT: movdqa %xmm6, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm3, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm7 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm7 -; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm7, %xmm8 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 +; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 -; SSE4-NEXT: movapd %xmm10, %xmm1 -; SSE4-NEXT: movapd %xmm9, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: movapd %xmm5, %xmm1 +; SSE4-NEXT: movapd %xmm6, %xmm2 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test126: @@ -5285,21 +5275,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; 
SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -5333,38 +5323,35 @@ ; ; SSE4-LABEL: test127: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm6, %xmm9 -; SSE4-NEXT: movdqa %xmm5, %xmm10 -; SSE4-NEXT: movdqa %xmm4, %xmm5 -; SSE4-NEXT: movdqa %xmm0, %xmm6 -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: pxor %xmm7, %xmm4 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; SSE4-NEXT: movdqa %xmm10, %xmm4 -; SSE4-NEXT: pxor %xmm7, %xmm4 +; SSE4-NEXT: movdqa %xmm0, %xmm9 +; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm4, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 +; SSE4-NEXT: movdqa %xmm5, %xmm9 +; SSE4-NEXT: pxor %xmm8, %xmm9 ; SSE4-NEXT: movdqa %xmm1, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10 -; SSE4-NEXT: movdqa %xmm9, %xmm1 -; SSE4-NEXT: pxor %xmm7, %xmm1 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE4-NEXT: movdqa %xmm6, %xmm1 +; SSE4-NEXT: pxor %xmm8, %xmm1 ; SSE4-NEXT: movdqa %xmm2, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE4-NEXT: movdqa %xmm8, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pxor %xmm3, %xmm7 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm7 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 -; SSE4-NEXT: movapd %xmm5, %xmm0 -; SSE4-NEXT: movapd %xmm10, %xmm1 -; SSE4-NEXT: movapd %xmm9, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm3, %xmm8 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 +; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE4-NEXT: movapd %xmm4, %xmm0 +; SSE4-NEXT: movapd %xmm5, %xmm1 +; SSE4-NEXT: movapd %xmm6, %xmm2 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test127: @@ -5434,21 +5421,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor 
%xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -5482,38 +5469,35 @@ ; ; SSE4-LABEL: test128: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm6, %xmm9 -; SSE4-NEXT: movdqa %xmm5, %xmm10 -; SSE4-NEXT: movdqa %xmm4, %xmm5 -; SSE4-NEXT: movdqa %xmm0, %xmm6 -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: pxor %xmm7, %xmm4 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; SSE4-NEXT: movdqa %xmm10, %xmm4 -; SSE4-NEXT: pxor %xmm7, %xmm4 +; SSE4-NEXT: movdqa %xmm0, %xmm9 +; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm4, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 +; SSE4-NEXT: movdqa %xmm5, %xmm9 +; SSE4-NEXT: pxor %xmm8, %xmm9 ; SSE4-NEXT: movdqa %xmm1, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10 -; SSE4-NEXT: movdqa %xmm9, %xmm1 -; SSE4-NEXT: pxor %xmm7, %xmm1 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE4-NEXT: movdqa %xmm6, %xmm1 +; SSE4-NEXT: pxor %xmm8, %xmm1 ; SSE4-NEXT: movdqa %xmm2, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE4-NEXT: movdqa %xmm8, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pxor %xmm3, %xmm7 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm7 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 -; SSE4-NEXT: movapd %xmm5, %xmm0 -; SSE4-NEXT: movapd %xmm10, %xmm1 -; SSE4-NEXT: movapd %xmm9, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm3, %xmm8 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 +; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE4-NEXT: movapd %xmm4, %xmm0 +; SSE4-NEXT: movapd %xmm5, %xmm1 +; SSE4-NEXT: movapd %xmm6, %xmm2 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test128: @@ -6748,14 +6732,14 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; 
SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -6823,14 +6807,14 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -6898,14 +6882,14 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm6, %xmm5 @@ -6973,14 +6957,14 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm6, %xmm5 @@ -7057,21 +7041,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm2, 
%xmm5 @@ -7105,10 +7089,9 @@ ; ; SSE4-LABEL: test153: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm0, %xmm7 +; SSE4-NEXT: movdqa %xmm0, %xmm8 ; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm5, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 @@ -7116,12 +7099,12 @@ ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm3, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 ; SSE4-NEXT: movapd %xmm6, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test153: @@ -7177,21 +7160,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -7225,10 +7208,9 @@ ; ; SSE4-LABEL: test154: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm0, %xmm7 +; SSE4-NEXT: movdqa %xmm0, %xmm8 ; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE4-NEXT: movdqa %xmm1, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm5, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 @@ -7236,12 +7218,12 @@ ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm3, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 ; SSE4-NEXT: movapd %xmm6, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test154: @@ -7297,21 +7279,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; 
SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm6, %xmm5 @@ -7345,24 +7327,23 @@ ; ; SSE4-LABEL: test155: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm0, %xmm7 +; SSE4-NEXT: movdqa %xmm0, %xmm8 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm6, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 -; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm5, %xmm1 ; SSE4-NEXT: movapd %xmm6, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test155: @@ -7418,21 +7399,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -7466,38 +7447,35 @@ ; ; SSE4-LABEL: test156: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm6, %xmm9 -; SSE4-NEXT: movdqa %xmm5, %xmm10 -; SSE4-NEXT: movdqa %xmm4, %xmm5 -; SSE4-NEXT: movdqa %xmm0, %xmm6 -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: pxor %xmm7, %xmm4 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; SSE4-NEXT: movdqa %xmm10, %xmm4 -; SSE4-NEXT: pxor %xmm7, %xmm4 +; SSE4-NEXT: movdqa %xmm0, %xmm9 +; SSE4-NEXT: movdqa 
{{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm4, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 +; SSE4-NEXT: movdqa %xmm5, %xmm9 +; SSE4-NEXT: pxor %xmm8, %xmm9 ; SSE4-NEXT: movdqa %xmm1, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10 -; SSE4-NEXT: movdqa %xmm9, %xmm1 -; SSE4-NEXT: pxor %xmm7, %xmm1 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE4-NEXT: movdqa %xmm6, %xmm1 +; SSE4-NEXT: pxor %xmm8, %xmm1 ; SSE4-NEXT: movdqa %xmm2, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE4-NEXT: movdqa %xmm8, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pxor %xmm3, %xmm7 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm7 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 -; SSE4-NEXT: movapd %xmm5, %xmm0 -; SSE4-NEXT: movapd %xmm10, %xmm1 -; SSE4-NEXT: movapd %xmm9, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm3, %xmm8 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 +; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE4-NEXT: movapd %xmm4, %xmm0 +; SSE4-NEXT: movapd %xmm5, %xmm1 +; SSE4-NEXT: movapd %xmm6, %xmm2 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test156: @@ -7567,21 +7545,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm6, %xmm5 @@ -7615,39 +7593,36 @@ ; ; SSE4-LABEL: test159: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm6, %xmm9 -; SSE4-NEXT: movdqa %xmm5, %xmm10 -; SSE4-NEXT: movdqa %xmm0, %xmm5 -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm0, %xmm6 -; SSE4-NEXT: pxor %xmm7, %xmm6 +; SSE4-NEXT: movdqa %xmm0, %xmm9 +; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm0, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm10 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE4-NEXT: movdqa %xmm1, 
%xmm5 -; SSE4-NEXT: pxor %xmm7, %xmm5 -; SSE4-NEXT: movdqa %xmm10, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm5, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 +; SSE4-NEXT: movdqa %xmm1, %xmm9 +; SSE4-NEXT: pxor %xmm8, %xmm9 +; SSE4-NEXT: movdqa %xmm5, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm2, %xmm1 -; SSE4-NEXT: pxor %xmm7, %xmm1 -; SSE4-NEXT: movdqa %xmm9, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm1 +; SSE4-NEXT: movdqa %xmm6, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm3, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm7 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm7 -; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm7, %xmm8 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 +; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 -; SSE4-NEXT: movapd %xmm10, %xmm1 -; SSE4-NEXT: movapd %xmm9, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: movapd %xmm5, %xmm1 +; SSE4-NEXT: movapd %xmm6, %xmm2 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test159: @@ -7717,21 +7692,21 @@ ; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 ; SSE2-NEXT: movdqa %xmm6, %xmm5 @@ -7765,39 +7740,36 @@ ; ; SSE4-LABEL: test160: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movdqa %xmm7, %xmm8 -; SSE4-NEXT: movdqa %xmm6, %xmm9 -; SSE4-NEXT: movdqa %xmm5, %xmm10 -; SSE4-NEXT: movdqa %xmm0, %xmm5 -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm0, %xmm6 -; SSE4-NEXT: pxor %xmm7, %xmm6 +; SSE4-NEXT: movdqa %xmm0, %xmm9 +; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm0, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm10 ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE4-NEXT: movdqa %xmm1, %xmm5 -; SSE4-NEXT: pxor %xmm7, %xmm5 -; SSE4-NEXT: movdqa %xmm10, %xmm0 -; SSE4-NEXT: 
pxor %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm5, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm4 +; SSE4-NEXT: movdqa %xmm1, %xmm9 +; SSE4-NEXT: pxor %xmm8, %xmm9 +; SSE4-NEXT: movdqa %xmm5, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm9, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 ; SSE4-NEXT: movdqa %xmm2, %xmm1 -; SSE4-NEXT: pxor %xmm7, %xmm1 -; SSE4-NEXT: movdqa %xmm9, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm1 +; SSE4-NEXT: movdqa %xmm6, %xmm0 +; SSE4-NEXT: pxor %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm3, %xmm0 -; SSE4-NEXT: pxor %xmm7, %xmm0 -; SSE4-NEXT: pxor %xmm8, %xmm7 -; SSE4-NEXT: pcmpgtq %xmm0, %xmm7 -; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE4-NEXT: pxor %xmm8, %xmm0 +; SSE4-NEXT: pxor %xmm7, %xmm8 +; SSE4-NEXT: pcmpgtq %xmm0, %xmm8 +; SSE4-NEXT: movdqa %xmm8, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE4-NEXT: movapd %xmm4, %xmm0 -; SSE4-NEXT: movapd %xmm10, %xmm1 -; SSE4-NEXT: movapd %xmm9, %xmm2 -; SSE4-NEXT: movapd %xmm8, %xmm3 +; SSE4-NEXT: movapd %xmm5, %xmm1 +; SSE4-NEXT: movapd %xmm6, %xmm2 +; SSE4-NEXT: movapd %xmm7, %xmm3 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test160: @@ -10288,53 +10260,53 @@ define <8 x i64> @concat_smin_smax(<4 x i64> %a0, <4 x i64> %a1) { ; SSE2-LABEL: concat_smin_smax: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm11, %xmm6 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: pxor %xmm11, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pxor %xmm7, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE2-NEXT: movdqa %xmm9, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm8, %xmm4 -; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm11, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2] -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm12 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm7, %xmm9 +; SSE2-NEXT: pxor %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2] +; SSE2-NEXT: movdqa %xmm7, %xmm12 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = 
xmm12[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm12, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: por %xmm11, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm5, %xmm11 ; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm7, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: por %xmm11, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE2-NEXT: pand %xmm10, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm8 -; SSE2-NEXT: pandn %xmm2, %xmm6 ; SSE2-NEXT: por %xmm8, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,2,2] +; SSE2-NEXT: pand %xmm12, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm4 @@ -10346,7 +10318,7 @@ ; ; SSE4-LABEL: concat_smin_smax: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa %xmm0, %xmm8 +; SSE4-NEXT: movdqa %xmm0, %xmm4 ; SSE4-NEXT: movdqa %xmm0, %xmm5 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm5 ; SSE4-NEXT: movdqa %xmm0, %xmm6 @@ -10354,15 +10326,15 @@ ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE4-NEXT: movdqa %xmm1, %xmm7 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm7 -; SSE4-NEXT: movdqa %xmm1, %xmm4 +; SSE4-NEXT: movdqa %xmm1, %xmm8 ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm4 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 ; SSE4-NEXT: movdqa %xmm5, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm2 +; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE4-NEXT: movapd %xmm6, %xmm0 -; SSE4-NEXT: movapd %xmm4, %xmm1 +; SSE4-NEXT: movapd %xmm8, %xmm1 ; SSE4-NEXT: retq ; ; AVX1-LABEL: concat_smin_smax: diff --git a/llvm/test/CodeGen/X86/vselect-packss.ll b/llvm/test/CodeGen/X86/vselect-packss.ll --- a/llvm/test/CodeGen/X86/vselect-packss.ll +++ b/llvm/test/CodeGen/X86/vselect-packss.ll @@ -264,13 +264,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9 ; AVX1-NEXT: vpcmpeqq %xmm8, %xmm9, %xmm8 ; AVX1-NEXT: vpcmpeqq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm8 +; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vpcmpeqq %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpcmpeqq %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm6, %xmm3 diff --git a/llvm/test/CodeGen/X86/x86-cmov-converter.ll b/llvm/test/CodeGen/X86/x86-cmov-converter.ll --- a/llvm/test/CodeGen/X86/x86-cmov-converter.ll +++ b/llvm/test/CodeGen/X86/x86-cmov-converter.ll @@ -109,25 +109,25 @@ ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: jle .LBB0_5 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: movl %edi, %r8d +; 
CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .LBB0_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl (%rcx,%rdi,4), %eax -; CHECK-NEXT: leal 1(%rax), %r9d -; CHECK-NEXT: imull %esi, %eax -; CHECK-NEXT: movl $10, %r10d -; CHECK-NEXT: cmpl %edx, %eax +; CHECK-NEXT: movl (%rcx,%rdi,4), %r10d +; CHECK-NEXT: leal 1(%r10), %r8d +; CHECK-NEXT: imull %esi, %r10d +; CHECK-NEXT: movl $10, %r9d +; CHECK-NEXT: cmpl %edx, %r10d ; CHECK-NEXT: jg .LBB0_4 ; CHECK-NEXT: # %bb.3: # %for.body ; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: movl %r9d, %r10d +; CHECK-NEXT: movl %r8d, %r9d ; CHECK-NEXT: .LBB0_4: # %for.body ; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: imull %r9d, %r10d -; CHECK-NEXT: movl %r10d, (%rcx,%rdi,4) +; CHECK-NEXT: imull %r8d, %r9d +; CHECK-NEXT: movl %r9d, (%rcx,%rdi,4) ; CHECK-NEXT: addq $1, %rdi -; CHECK-NEXT: cmpq %rdi, %r8 +; CHECK-NEXT: cmpq %rdi, %rax ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: .LBB0_5: # %for.cond.cleanup ; CHECK-NEXT: retq @@ -137,25 +137,25 @@ ; CHECK-FORCEALL-NEXT: testl %edi, %edi ; CHECK-FORCEALL-NEXT: jle .LBB0_5 ; CHECK-FORCEALL-NEXT: # %bb.1: # %for.body.preheader -; CHECK-FORCEALL-NEXT: movl %edi, %r8d +; CHECK-FORCEALL-NEXT: movl %edi, %eax ; CHECK-FORCEALL-NEXT: xorl %edi, %edi ; CHECK-FORCEALL-NEXT: .LBB0_2: # %for.body ; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-FORCEALL-NEXT: movl (%rcx,%rdi,4), %eax -; CHECK-FORCEALL-NEXT: leal 1(%rax), %r9d -; CHECK-FORCEALL-NEXT: imull %esi, %eax -; CHECK-FORCEALL-NEXT: movl $10, %r10d -; CHECK-FORCEALL-NEXT: cmpl %edx, %eax +; CHECK-FORCEALL-NEXT: movl (%rcx,%rdi,4), %r10d +; CHECK-FORCEALL-NEXT: leal 1(%r10), %r8d +; CHECK-FORCEALL-NEXT: imull %esi, %r10d +; CHECK-FORCEALL-NEXT: movl $10, %r9d +; CHECK-FORCEALL-NEXT: cmpl %edx, %r10d ; CHECK-FORCEALL-NEXT: jg .LBB0_4 ; CHECK-FORCEALL-NEXT: # %bb.3: # %for.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB0_2 Depth=1 -; CHECK-FORCEALL-NEXT: movl %r9d, %r10d +; CHECK-FORCEALL-NEXT: movl %r8d, %r9d ; CHECK-FORCEALL-NEXT: .LBB0_4: # %for.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB0_2 Depth=1 -; CHECK-FORCEALL-NEXT: imull %r9d, %r10d -; CHECK-FORCEALL-NEXT: movl %r10d, (%rcx,%rdi,4) +; CHECK-FORCEALL-NEXT: imull %r8d, %r9d +; CHECK-FORCEALL-NEXT: movl %r9d, (%rcx,%rdi,4) ; CHECK-FORCEALL-NEXT: addq $1, %rdi -; CHECK-FORCEALL-NEXT: cmpq %rdi, %r8 +; CHECK-FORCEALL-NEXT: cmpq %rdi, %rax ; CHECK-FORCEALL-NEXT: jne .LBB0_2 ; CHECK-FORCEALL-NEXT: .LBB0_5: # %for.cond.cleanup ; CHECK-FORCEALL-NEXT: retq @@ -192,23 +192,23 @@ ; CHECK-NEXT: jle .LBB1_3 ; CHECK-NEXT: # %bb.1: # %for.body.preheader ; CHECK-NEXT: movl %edx, %r9d -; CHECK-NEXT: movl %edi, %r10d -; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: movl %edi, %edi +; CHECK-NEXT: xorl %r10d, %r10d ; CHECK-NEXT: movl $10, %r11d ; CHECK-NEXT: .LBB1_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl (%rcx,%rdi,4), %eax +; CHECK-NEXT: movl (%rcx,%r10,4), %eax ; CHECK-NEXT: movl %eax, %edx ; CHECK-NEXT: imull %esi, %edx ; CHECK-NEXT: cmpl %r9d, %edx ; CHECK-NEXT: cmovgl %r11d, %eax -; CHECK-NEXT: movl %eax, (%rcx,%rdi,4) -; CHECK-NEXT: movl (%r8,%rdi,4), %eax +; CHECK-NEXT: movl %eax, (%rcx,%r10,4) +; CHECK-NEXT: movl (%r8,%r10,4), %eax ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %r9d -; CHECK-NEXT: movl %eax, (%r8,%rdi,4) -; CHECK-NEXT: addq $1, %rdi -; CHECK-NEXT: cmpq %rdi, %r10 +; CHECK-NEXT: movl %eax, (%r8,%r10,4) +; CHECK-NEXT: addq $1, %r10 +; CHECK-NEXT: cmpq %r10, %rdi ; 
CHECK-NEXT: jne .LBB1_2 ; CHECK-NEXT: .LBB1_3: # %for.cond.cleanup ; CHECK-NEXT: retq @@ -219,28 +219,28 @@ ; CHECK-FORCEALL-NEXT: jle .LBB1_5 ; CHECK-FORCEALL-NEXT: # %bb.1: # %for.body.preheader ; CHECK-FORCEALL-NEXT: movl %edx, %r9d -; CHECK-FORCEALL-NEXT: movl %edi, %r10d -; CHECK-FORCEALL-NEXT: xorl %edi, %edi +; CHECK-FORCEALL-NEXT: movl %edi, %edi +; CHECK-FORCEALL-NEXT: xorl %r10d, %r10d ; CHECK-FORCEALL-NEXT: .LBB1_2: # %for.body ; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-FORCEALL-NEXT: movl (%rcx,%rdi,4), %r11d -; CHECK-FORCEALL-NEXT: movl %r11d, %eax -; CHECK-FORCEALL-NEXT: imull %esi, %eax +; CHECK-FORCEALL-NEXT: movl (%rcx,%r10,4), %eax +; CHECK-FORCEALL-NEXT: movl %eax, %r11d +; CHECK-FORCEALL-NEXT: imull %esi, %r11d ; CHECK-FORCEALL-NEXT: movl $10, %edx -; CHECK-FORCEALL-NEXT: cmpl %r9d, %eax +; CHECK-FORCEALL-NEXT: cmpl %r9d, %r11d ; CHECK-FORCEALL-NEXT: jg .LBB1_4 ; CHECK-FORCEALL-NEXT: # %bb.3: # %for.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-FORCEALL-NEXT: movl %r11d, %edx +; CHECK-FORCEALL-NEXT: movl %eax, %edx ; CHECK-FORCEALL-NEXT: .LBB1_4: # %for.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-FORCEALL-NEXT: movl %edx, (%rcx,%rdi,4) -; CHECK-FORCEALL-NEXT: movl (%r8,%rdi,4), %eax +; CHECK-FORCEALL-NEXT: movl %edx, (%rcx,%r10,4) +; CHECK-FORCEALL-NEXT: movl (%r8,%r10,4), %eax ; CHECK-FORCEALL-NEXT: cltd ; CHECK-FORCEALL-NEXT: idivl %r9d -; CHECK-FORCEALL-NEXT: movl %eax, (%r8,%rdi,4) -; CHECK-FORCEALL-NEXT: addq $1, %rdi -; CHECK-FORCEALL-NEXT: cmpq %rdi, %r10 +; CHECK-FORCEALL-NEXT: movl %eax, (%r8,%r10,4) +; CHECK-FORCEALL-NEXT: addq $1, %r10 +; CHECK-FORCEALL-NEXT: cmpq %r10, %rdi ; CHECK-FORCEALL-NEXT: jne .LBB1_2 ; CHECK-FORCEALL-NEXT: .LBB1_5: # %for.cond.cleanup ; CHECK-FORCEALL-NEXT: retq @@ -279,15 +279,15 @@ ; CHECK-NEXT: cmpl $2, %edi ; CHECK-NEXT: jl .LBB2_5 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: movl %edi, %r8d +; CHECK-NEXT: movl %edi, %ecx ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: movl $1, %edx ; CHECK-NEXT: .LBB2_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl (%rsi,%rdx,4), %r9d -; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movl (%rsi,%rdx,4), %r8d +; CHECK-NEXT: movslq %edi, %r9 ; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: cmpl (%rsi,%rcx,4), %r9d +; CHECK-NEXT: cmpl (%rsi,%r9,4), %r8d ; CHECK-NEXT: jg .LBB2_4 ; CHECK-NEXT: # %bb.3: # %for.body ; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 @@ -296,7 +296,7 @@ ; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 ; CHECK-NEXT: addq $1, %rdx ; CHECK-NEXT: movl %eax, %edi -; CHECK-NEXT: cmpq %rdx, %r8 +; CHECK-NEXT: cmpq %rdx, %rcx ; CHECK-NEXT: jne .LBB2_2 ; CHECK-NEXT: .LBB2_5: # %for.cond.cleanup ; CHECK-NEXT: retq @@ -307,15 +307,15 @@ ; CHECK-FORCEALL-NEXT: cmpl $2, %edi ; CHECK-FORCEALL-NEXT: jl .LBB2_5 ; CHECK-FORCEALL-NEXT: # %bb.1: # %for.body.preheader -; CHECK-FORCEALL-NEXT: movl %edi, %r8d +; CHECK-FORCEALL-NEXT: movl %edi, %ecx ; CHECK-FORCEALL-NEXT: xorl %edi, %edi ; CHECK-FORCEALL-NEXT: movl $1, %edx ; CHECK-FORCEALL-NEXT: .LBB2_2: # %for.body ; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-FORCEALL-NEXT: movl (%rsi,%rdx,4), %r9d -; CHECK-FORCEALL-NEXT: movslq %edi, %rcx +; CHECK-FORCEALL-NEXT: movl (%rsi,%rdx,4), %r8d +; CHECK-FORCEALL-NEXT: movslq %edi, %r9 ; CHECK-FORCEALL-NEXT: movl %edx, %eax -; CHECK-FORCEALL-NEXT: cmpl (%rsi,%rcx,4), %r9d +; CHECK-FORCEALL-NEXT: cmpl (%rsi,%r9,4), %r8d ; CHECK-FORCEALL-NEXT: jg .LBB2_4 ; 
CHECK-FORCEALL-NEXT: # %bb.3: # %for.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB2_2 Depth=1 @@ -324,7 +324,7 @@ ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB2_2 Depth=1 ; CHECK-FORCEALL-NEXT: addq $1, %rdx ; CHECK-FORCEALL-NEXT: movl %eax, %edi -; CHECK-FORCEALL-NEXT: cmpq %rdx, %r8 +; CHECK-FORCEALL-NEXT: cmpq %rdx, %rcx ; CHECK-FORCEALL-NEXT: jne .LBB2_2 ; CHECK-FORCEALL-NEXT: .LBB2_5: # %for.cond.cleanup ; CHECK-FORCEALL-NEXT: retq @@ -364,15 +364,15 @@ ; CHECK-NEXT: cmpl $2, %edi ; CHECK-NEXT: jl .LBB3_5 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: movl %edi, %r8d +; CHECK-NEXT: movl %edi, %ecx ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: movl $1, %edx ; CHECK-NEXT: .LBB3_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl (%rsi,%rdx,4), %r9d -; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movl (%rsi,%rdx,4), %r8d +; CHECK-NEXT: movslq %edi, %r9 ; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: cmpl (%rsi,%rcx,4), %r9d +; CHECK-NEXT: cmpl (%rsi,%r9,4), %r8d ; CHECK-NEXT: jg .LBB3_4 ; CHECK-NEXT: # %bb.3: # %for.body ; CHECK-NEXT: # in Loop: Header=BB3_2 Depth=1 @@ -381,7 +381,7 @@ ; CHECK-NEXT: # in Loop: Header=BB3_2 Depth=1 ; CHECK-NEXT: addq $1, %rdx ; CHECK-NEXT: movl %eax, %edi -; CHECK-NEXT: cmpq %rdx, %r8 +; CHECK-NEXT: cmpq %rdx, %rcx ; CHECK-NEXT: jne .LBB3_2 ; CHECK-NEXT: .LBB3_5: # %for.cond.cleanup ; CHECK-NEXT: retq @@ -392,15 +392,15 @@ ; CHECK-FORCEALL-NEXT: cmpl $2, %edi ; CHECK-FORCEALL-NEXT: jl .LBB3_5 ; CHECK-FORCEALL-NEXT: # %bb.1: # %for.body.preheader -; CHECK-FORCEALL-NEXT: movl %edi, %r8d +; CHECK-FORCEALL-NEXT: movl %edi, %ecx ; CHECK-FORCEALL-NEXT: xorl %edi, %edi ; CHECK-FORCEALL-NEXT: movl $1, %edx ; CHECK-FORCEALL-NEXT: .LBB3_2: # %for.body ; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-FORCEALL-NEXT: movl (%rsi,%rdx,4), %r9d -; CHECK-FORCEALL-NEXT: movslq %edi, %rcx +; CHECK-FORCEALL-NEXT: movl (%rsi,%rdx,4), %r8d +; CHECK-FORCEALL-NEXT: movslq %edi, %r9 ; CHECK-FORCEALL-NEXT: movl %edx, %eax -; CHECK-FORCEALL-NEXT: cmpl (%rsi,%rcx,4), %r9d +; CHECK-FORCEALL-NEXT: cmpl (%rsi,%r9,4), %r8d ; CHECK-FORCEALL-NEXT: jg .LBB3_4 ; CHECK-FORCEALL-NEXT: # %bb.3: # %for.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB3_2 Depth=1 @@ -409,7 +409,7 @@ ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB3_2 Depth=1 ; CHECK-FORCEALL-NEXT: addq $1, %rdx ; CHECK-FORCEALL-NEXT: movl %eax, %edi -; CHECK-FORCEALL-NEXT: cmpq %rdx, %r8 +; CHECK-FORCEALL-NEXT: cmpq %rdx, %rcx ; CHECK-FORCEALL-NEXT: jne .LBB3_2 ; CHECK-FORCEALL-NEXT: .LBB3_5: # %for.cond.cleanup ; CHECK-FORCEALL-NEXT: retq @@ -463,29 +463,29 @@ ; ; CHECK-FORCEALL-LABEL: MaxValue: ; CHECK-FORCEALL: # %bb.0: # %entry -; CHECK-FORCEALL-NEXT: movl (%rsi), %ecx +; CHECK-FORCEALL-NEXT: movl (%rsi), %r8d ; CHECK-FORCEALL-NEXT: cmpl $2, %edi ; CHECK-FORCEALL-NEXT: jge .LBB4_3 ; CHECK-FORCEALL-NEXT: # %bb.1: -; CHECK-FORCEALL-NEXT: movl %ecx, %eax +; CHECK-FORCEALL-NEXT: movl %r8d, %eax ; CHECK-FORCEALL-NEXT: .LBB4_2: # %for.cond.cleanup ; CHECK-FORCEALL-NEXT: retq ; CHECK-FORCEALL-NEXT: .LBB4_3: # %for.body.preheader -; CHECK-FORCEALL-NEXT: movl %edi, %edi +; CHECK-FORCEALL-NEXT: movl %edi, %ecx ; CHECK-FORCEALL-NEXT: movl $1, %edx ; CHECK-FORCEALL-NEXT: .LBB4_4: # %for.body ; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-FORCEALL-NEXT: movl (%rsi,%rdx,4), %eax -; CHECK-FORCEALL-NEXT: cmpl %ecx, %eax +; CHECK-FORCEALL-NEXT: cmpl %r8d, %eax ; CHECK-FORCEALL-NEXT: jg .LBB4_6 ; CHECK-FORCEALL-NEXT: # %bb.5: # %for.body ; CHECK-FORCEALL-NEXT: # in 
Loop: Header=BB4_4 Depth=1 -; CHECK-FORCEALL-NEXT: movl %ecx, %eax +; CHECK-FORCEALL-NEXT: movl %r8d, %eax ; CHECK-FORCEALL-NEXT: .LBB4_6: # %for.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB4_4 Depth=1 ; CHECK-FORCEALL-NEXT: addq $1, %rdx -; CHECK-FORCEALL-NEXT: movl %eax, %ecx -; CHECK-FORCEALL-NEXT: cmpq %rdx, %rdi +; CHECK-FORCEALL-NEXT: movl %eax, %r8d +; CHECK-FORCEALL-NEXT: cmpq %rdx, %rcx ; CHECK-FORCEALL-NEXT: je .LBB4_2 ; CHECK-FORCEALL-NEXT: jmp .LBB4_4 entry: @@ -611,27 +611,27 @@ ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB6_5 ; CHECK-NEXT: # %bb.1: # %while.body.preheader -; CHECK-NEXT: movl %edx, %r8d +; CHECK-NEXT: movl %edx, %ecx ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .LBB6_2: # %while.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: movslq %esi, %rsi ; CHECK-NEXT: movl (%rdi,%rsi,4), %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %r8d +; CHECK-NEXT: divl %ecx ; CHECK-NEXT: movl %eax, %edx ; CHECK-NEXT: movl $11, %eax -; CHECK-NEXT: movl %r8d, %ecx -; CHECK-NEXT: cmpl %r8d, %edx +; CHECK-NEXT: movl %ecx, %r8d +; CHECK-NEXT: cmpl %ecx, %edx ; CHECK-NEXT: ja .LBB6_4 ; CHECK-NEXT: # %bb.3: # %while.body ; CHECK-NEXT: # in Loop: Header=BB6_2 Depth=1 ; CHECK-NEXT: movl $22, %eax -; CHECK-NEXT: movl $22, %ecx +; CHECK-NEXT: movl $22, %r8d ; CHECK-NEXT: .LBB6_4: # %while.body ; CHECK-NEXT: # in Loop: Header=BB6_2 Depth=1 ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %ecx +; CHECK-NEXT: divl %r8d ; CHECK-NEXT: movl %edx, (%rdi,%rsi,4) ; CHECK-NEXT: addl $1, %esi ; CHECK-NEXT: cmpl %r9d, %esi @@ -645,27 +645,27 @@ ; CHECK-FORCEALL-NEXT: testb %al, %al ; CHECK-FORCEALL-NEXT: jne .LBB6_5 ; CHECK-FORCEALL-NEXT: # %bb.1: # %while.body.preheader -; CHECK-FORCEALL-NEXT: movl %edx, %r8d +; CHECK-FORCEALL-NEXT: movl %edx, %ecx ; CHECK-FORCEALL-NEXT: xorl %esi, %esi ; CHECK-FORCEALL-NEXT: .LBB6_2: # %while.body ; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-FORCEALL-NEXT: movslq %esi, %rsi ; CHECK-FORCEALL-NEXT: movl (%rdi,%rsi,4), %eax ; CHECK-FORCEALL-NEXT: xorl %edx, %edx -; CHECK-FORCEALL-NEXT: divl %r8d +; CHECK-FORCEALL-NEXT: divl %ecx ; CHECK-FORCEALL-NEXT: movl %eax, %edx ; CHECK-FORCEALL-NEXT: movl $11, %eax -; CHECK-FORCEALL-NEXT: movl %r8d, %ecx -; CHECK-FORCEALL-NEXT: cmpl %r8d, %edx +; CHECK-FORCEALL-NEXT: movl %ecx, %r8d +; CHECK-FORCEALL-NEXT: cmpl %ecx, %edx ; CHECK-FORCEALL-NEXT: ja .LBB6_4 ; CHECK-FORCEALL-NEXT: # %bb.3: # %while.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB6_2 Depth=1 ; CHECK-FORCEALL-NEXT: movl $22, %eax -; CHECK-FORCEALL-NEXT: movl $22, %ecx +; CHECK-FORCEALL-NEXT: movl $22, %r8d ; CHECK-FORCEALL-NEXT: .LBB6_4: # %while.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB6_2 Depth=1 ; CHECK-FORCEALL-NEXT: xorl %edx, %edx -; CHECK-FORCEALL-NEXT: divl %ecx +; CHECK-FORCEALL-NEXT: divl %r8d ; CHECK-FORCEALL-NEXT: movl %edx, (%rdi,%rsi,4) ; CHECK-FORCEALL-NEXT: addl $1, %esi ; CHECK-FORCEALL-NEXT: cmpl %r9d, %esi @@ -762,12 +762,12 @@ ; CHECK-NEXT: cmpl %esi, %edi ; CHECK-NEXT: ja .LBB9_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movl (%rcx), %r8d +; CHECK-NEXT: movl (%rcx), %edx ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movl %esi, %r8d ; CHECK-NEXT: .LBB9_2: # %entry -; CHECK-NEXT: addl %r8d, %eax ; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: addl %r8d, %eax ; CHECK-NEXT: retq ; ; CHECK-FORCEALL-LABEL: test_cmov_memoperand_in_group: @@ -777,12 +777,12 @@ ; CHECK-FORCEALL-NEXT: cmpl %esi, %edi ; CHECK-FORCEALL-NEXT: ja .LBB9_2 ; CHECK-FORCEALL-NEXT: # 
%bb.1: # %entry -; CHECK-FORCEALL-NEXT: movl (%rcx), %r8d +; CHECK-FORCEALL-NEXT: movl (%rcx), %edx ; CHECK-FORCEALL-NEXT: movl %edi, %eax -; CHECK-FORCEALL-NEXT: movl %esi, %edx +; CHECK-FORCEALL-NEXT: movl %esi, %r8d ; CHECK-FORCEALL-NEXT: .LBB9_2: # %entry -; CHECK-FORCEALL-NEXT: addl %r8d, %eax ; CHECK-FORCEALL-NEXT: addl %edx, %eax +; CHECK-FORCEALL-NEXT: addl %r8d, %eax ; CHECK-FORCEALL-NEXT: retq entry: %cond = icmp ugt i32 %a, %b @@ -804,12 +804,12 @@ ; CHECK-NEXT: cmpl %esi, %edi ; CHECK-NEXT: jbe .LBB10_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movl (%rcx), %r8d +; CHECK-NEXT: movl (%rcx), %edx ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movl %esi, %r8d ; CHECK-NEXT: .LBB10_2: # %entry -; CHECK-NEXT: addl %r8d, %eax ; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: addl %r8d, %eax ; CHECK-NEXT: retq ; ; CHECK-FORCEALL-LABEL: test_cmov_memoperand_in_group2: @@ -819,12 +819,12 @@ ; CHECK-FORCEALL-NEXT: cmpl %esi, %edi ; CHECK-FORCEALL-NEXT: jbe .LBB10_2 ; CHECK-FORCEALL-NEXT: # %bb.1: # %entry -; CHECK-FORCEALL-NEXT: movl (%rcx), %r8d +; CHECK-FORCEALL-NEXT: movl (%rcx), %edx ; CHECK-FORCEALL-NEXT: movl %edi, %eax -; CHECK-FORCEALL-NEXT: movl %esi, %edx +; CHECK-FORCEALL-NEXT: movl %esi, %r8d ; CHECK-FORCEALL-NEXT: .LBB10_2: # %entry -; CHECK-FORCEALL-NEXT: addl %r8d, %eax ; CHECK-FORCEALL-NEXT: addl %edx, %eax +; CHECK-FORCEALL-NEXT: addl %r8d, %eax ; CHECK-FORCEALL-NEXT: retq entry: %cond = icmp ugt i32 %a, %b @@ -969,32 +969,32 @@ define void @test_memoperand_loop(i32 %data) #0 { ; CHECK-LABEL: test_memoperand_loop: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq begin@GOTPCREL(%rip), %r8 -; CHECK-NEXT: movq (%r8), %rax -; CHECK-NEXT: movq end@GOTPCREL(%rip), %rcx -; CHECK-NEXT: movq (%rcx), %rdx +; CHECK-NEXT: movq begin@GOTPCREL(%rip), %rax +; CHECK-NEXT: movq (%rax), %rcx +; CHECK-NEXT: movq end@GOTPCREL(%rip), %rdx +; CHECK-NEXT: movq (%rdx), %rdx ; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: movq %rcx, %r8 ; CHECK-NEXT: .LBB15_1: # %loop.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: addq $8, %rcx -; CHECK-NEXT: cmpq %rdx, %rcx +; CHECK-NEXT: addq $8, %r8 +; CHECK-NEXT: cmpq %rdx, %r8 ; CHECK-NEXT: ja .LBB15_3 ; CHECK-NEXT: # %bb.2: # %loop.body ; CHECK-NEXT: # in Loop: Header=BB15_1 Depth=1 -; CHECK-NEXT: movq (%r8), %rcx +; CHECK-NEXT: movq (%rax), %r8 ; CHECK-NEXT: .LBB15_3: # %loop.body ; CHECK-NEXT: # in Loop: Header=BB15_1 Depth=1 -; CHECK-NEXT: movl %edi, (%rcx) -; CHECK-NEXT: addq $8, %rcx -; CHECK-NEXT: cmpq %rdx, %rcx +; CHECK-NEXT: movl %edi, (%r8) +; CHECK-NEXT: addq $8, %r8 +; CHECK-NEXT: cmpq %rdx, %r8 ; CHECK-NEXT: ja .LBB15_5 ; CHECK-NEXT: # %bb.4: # %loop.body ; CHECK-NEXT: # in Loop: Header=BB15_1 Depth=1 -; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: movq %rcx, %r8 ; CHECK-NEXT: .LBB15_5: # %loop.body ; CHECK-NEXT: # in Loop: Header=BB15_1 Depth=1 -; CHECK-NEXT: movl %edi, (%rcx) +; CHECK-NEXT: movl %edi, (%r8) ; CHECK-NEXT: addl $1, %esi ; CHECK-NEXT: cmpl $1024, %esi # imm = 0x400 ; CHECK-NEXT: jl .LBB15_1 @@ -1003,32 +1003,32 @@ ; ; CHECK-FORCEALL-LABEL: test_memoperand_loop: ; CHECK-FORCEALL: # %bb.0: # %entry -; CHECK-FORCEALL-NEXT: movq begin@GOTPCREL(%rip), %r8 -; CHECK-FORCEALL-NEXT: movq (%r8), %rax -; CHECK-FORCEALL-NEXT: movq end@GOTPCREL(%rip), %rcx -; CHECK-FORCEALL-NEXT: movq (%rcx), %rdx +; CHECK-FORCEALL-NEXT: movq begin@GOTPCREL(%rip), %rax +; CHECK-FORCEALL-NEXT: movq (%rax), %rcx +; CHECK-FORCEALL-NEXT: movq end@GOTPCREL(%rip), %rdx +; 
CHECK-FORCEALL-NEXT: movq (%rdx), %rdx ; CHECK-FORCEALL-NEXT: xorl %esi, %esi -; CHECK-FORCEALL-NEXT: movq %rax, %rcx +; CHECK-FORCEALL-NEXT: movq %rcx, %r8 ; CHECK-FORCEALL-NEXT: .LBB15_1: # %loop.body ; CHECK-FORCEALL-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-FORCEALL-NEXT: addq $8, %rcx -; CHECK-FORCEALL-NEXT: cmpq %rdx, %rcx +; CHECK-FORCEALL-NEXT: addq $8, %r8 +; CHECK-FORCEALL-NEXT: cmpq %rdx, %r8 ; CHECK-FORCEALL-NEXT: ja .LBB15_3 ; CHECK-FORCEALL-NEXT: # %bb.2: # %loop.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB15_1 Depth=1 -; CHECK-FORCEALL-NEXT: movq (%r8), %rcx +; CHECK-FORCEALL-NEXT: movq (%rax), %r8 ; CHECK-FORCEALL-NEXT: .LBB15_3: # %loop.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB15_1 Depth=1 -; CHECK-FORCEALL-NEXT: movl %edi, (%rcx) -; CHECK-FORCEALL-NEXT: addq $8, %rcx -; CHECK-FORCEALL-NEXT: cmpq %rdx, %rcx +; CHECK-FORCEALL-NEXT: movl %edi, (%r8) +; CHECK-FORCEALL-NEXT: addq $8, %r8 +; CHECK-FORCEALL-NEXT: cmpq %rdx, %r8 ; CHECK-FORCEALL-NEXT: ja .LBB15_5 ; CHECK-FORCEALL-NEXT: # %bb.4: # %loop.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB15_1 Depth=1 -; CHECK-FORCEALL-NEXT: movq %rax, %rcx +; CHECK-FORCEALL-NEXT: movq %rcx, %r8 ; CHECK-FORCEALL-NEXT: .LBB15_5: # %loop.body ; CHECK-FORCEALL-NEXT: # in Loop: Header=BB15_1 Depth=1 -; CHECK-FORCEALL-NEXT: movl %edi, (%rcx) +; CHECK-FORCEALL-NEXT: movl %edi, (%r8) ; CHECK-FORCEALL-NEXT: addl $1, %esi ; CHECK-FORCEALL-NEXT: cmpl $1024, %esi # imm = 0x400 ; CHECK-FORCEALL-NEXT: jl .LBB15_1 diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -249,33 +249,33 @@ define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3, <32 x i8> %x4, ptr %p) nounwind { ; AVX1-LABEL: interleaved_store_vf32_i8_stride4: ; AVX1: # %bb.0: -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: 
vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, 96(%rdi) +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, 96(%rdi) ; AVX1-NEXT: vmovaps %ymm3, 64(%rdi) -; AVX1-NEXT: vmovaps %ymm1, 32(%rdi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) ; AVX1-NEXT: vmovaps %ymm2, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -529,7 +529,7 @@ ; AVX512-LABEL: interleaved_load_vf16_i8_stride4: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm8 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3 @@ -548,21 +548,21 @@ ; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm6 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = 
<2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm0 +; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX512-NEXT: vpcmpeqb %zmm5, %zmm8, %k0 -; AVX512-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 +; AVX512-NEXT: vpcmpeqb %zmm5, %zmm0, %k0 +; AVX512-NEXT: vpcmpeqb %zmm1, %zmm6, %k1 ; AVX512-NEXT: kxnorw %k1, %k0, %k0 ; AVX512-NEXT: vpmovm2b %k0, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -585,84 +585,82 @@ ; AVX1-LABEL: interleaved_load_vf32_i8_stride4: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX1-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm13 -; AVX1-NEXT: vpshufb %xmm6, %xmm13, %xmm4 -; AVX1-NEXT: vpshufb %xmm6, %xmm12, %xmm5 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm4 +; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX1-NEXT: vpshufb %xmm0, %xmm10, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm5 +; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm7 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vmovdqa 112(%rdi), %xmm14 -; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm7 +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm7 ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm5 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm6 -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX1-NEXT: vpshufb %xmm8, %xmm6, %xmm11 ; AVX1-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-NEXT: vpshufb %xmm1, %xmm13, %xmm2 -; AVX1-NEXT: vpshufb %xmm1, %xmm12, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
<1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpcmpeqb %xmm0, %xmm8, %xmm0 -; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpcmpeqb %xmm0, %xmm9, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2,3],xmm10[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm8 +; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm12 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13 +; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4,5,6,7] +; AVX1-NEXT: vpcmpeqb %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm9 +; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm11 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX1-NEXT: vpshufb %xmm12, %xmm6, %xmm11 +; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm12 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7] +; AVX1-NEXT: vpcmpeqb %xmm9, %xmm10, %xmm9 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm11 +; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm12 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13 +; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm10, %xmm4, %xmm13 +; AVX1-NEXT: vpshufb %xmm10, %xmm5, %xmm10 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX1-NEXT: vpshufb %xmm12, %xmm6, %xmm13 +; AVX1-NEXT: vpshufb %xmm12, %xmm7, %xmm12 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm12 = 
xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm12, %xmm4 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm0 -; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm1 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpcmpeqb %xmm0, %xmm8, %xmm0 -; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpcmpeqb %xmm0, %xmm11, %xmm0 +; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm1 +; AVX1-NEXT: vpshufb %xmm12, %xmm5, %xmm2 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm2 -; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm15, %xmm1 -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm10, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -670,8 +668,8 @@ ; ; AVX2-LABEL: interleaved_load_vf32_i8_stride4: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 @@ -681,69 +679,69 @@ ; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm0 +; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %ymm7, %ymm10, %ymm9 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm9 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184] ; 
AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm0 -; AVX2-NEXT: vpshufb %xmm7, %xmm4, %xmm7 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm1 -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %ymm1, %ymm10, %ymm7 -; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vpshufb %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpcmpeqb %ymm0, %ymm9, %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm7 -; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %ymm1, %ymm10, %ymm7 -; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vpshufb %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm8 +; AVX2-NEXT: vpermd %ymm8, %ymm6, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 +; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 +; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 +; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9 +; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; 
AVX2-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 +; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 +; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 +; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9 +; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %ymm2, %ymm10, %ymm3 -; AVX2-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX2-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm0, %ymm9, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm7, %ymm0 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -1008,36 +1006,36 @@ ; AVX1-LABEL: interleaved_store_vf32_i8_stride3: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm4[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = 
xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqu %xmm3, 80(%rdi) -; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi) -; AVX1-NEXT: vmovdqu %xmm5, 48(%rdi) +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqu %xmm5, 80(%rdi) +; AVX1-NEXT: vmovdqu %xmm3, 64(%rdi) +; AVX1-NEXT: vmovdqu %xmm4, 48(%rdi) ; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi) -; AVX1-NEXT: vmovdqu %xmm6, 16(%rdi) +; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi) ; AVX1-NEXT: vmovdqu %xmm1, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1097,118 +1095,109 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, ptr %p) nounwind { ; AVX1-LABEL: interleaved_store_vf64_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: subq $88, %rsp +; AVX1-NEXT: pushq %rax ; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa %ymm3, %ymm11 -; AVX1-NEXT: vmovdqa %ymm2, %ymm12 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm13 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-NEXT: vpshufb %xmm5, %xmm13, %xmm8 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm15 -; AVX1-NEXT: vpshufb %xmm2, %xmm15, %xmm6 -; AVX1-NEXT: vpor %xmm6, %xmm8, %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-NEXT: 
vpshufb %xmm5, %xmm1, %xmm9 -; AVX1-NEXT: vpshufb %xmm2, %xmm11, %xmm6 -; AVX1-NEXT: vpor %xmm6, %xmm9, %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm5, %xmm10, %xmm14 -; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm6 -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm7 -; AVX1-NEXT: vpor %xmm7, %xmm14, %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm7, %xmm10, %xmm10 -; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm3 -; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm4 -; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm2, %xmm12, %xmm7 -; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm2 +; AVX1-NEXT: vmovdqa %ymm4, %ymm5 +; AVX1-NEXT: vmovdqa %ymm2, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm12 +; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm7 +; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm2 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vmovdqa %ymm1, %ymm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX1-NEXT: vpshufb %xmm0, %xmm8, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm11[8],xmm8[9],xmm11[9],xmm8[10],xmm11[10],xmm8[11],xmm11[11],xmm8[12],xmm11[12],xmm8[13],xmm11[13],xmm8[14],xmm11[14],xmm8[15],xmm11[15] -; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm5 -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4] -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm6 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm6 -; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] +; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm7 +; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm13 +; AVX1-NEXT: vpor %xmm7, %xmm13, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm10, %xmm8, %xmm13 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm14 +; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm15 +; AVX1-NEXT: vpor %xmm13, %xmm15, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm2 +; AVX1-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX1-NEXT: vpshufb %xmm15, %xmm0, %xmm15 +; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm10 +; AVX1-NEXT: vpor %xmm0, %xmm10, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm9 # 16-byte Folded Reload -; AVX1-NEXT: # xmm9 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] -; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload -; AVX1-NEXT: # xmm3 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] -; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm11 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm10 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm7 -; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm6 -; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm12 -; AVX1-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm1, %xmm14, %xmm4 -; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm1, %xmm15, %xmm7 -; AVX1-NEXT: vpshufb %xmm1, %xmm9, %xmm2 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm6 -; AVX1-NEXT: vpshufb %xmm1, %xmm13, %xmm8 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm9 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vmovdqu %xmm2, 80(%rdi) -; AVX1-NEXT: vmovdqu %xmm10, 64(%rdi) -; AVX1-NEXT: vmovdqu %xmm11, 16(%rdi) +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-NEXT: vpshufb %xmm11, %xmm10, 
%xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm10 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm13 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] +; AVX1-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm11, %xmm9, %xmm11 +; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm2, %xmm12, %xmm12 +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm12 +; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] +; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] +; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload +; AVX1-NEXT: # xmm8 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] +; AVX1-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX1-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX1-NEXT: vpshufb %xmm13, %xmm7, %xmm14 +; AVX1-NEXT: vpor %xmm14, %xmm10, %xmm10 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm9 +; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: 
vpshufb %xmm9, %xmm12, %xmm12 +; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm7 +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm2 +; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vmovdqu %xmm7, 80(%rdi) +; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi) +; AVX1-NEXT: vmovdqu %xmm5, 16(%rdi) ; AVX1-NEXT: vmovdqu %xmm4, (%rdi) -; AVX1-NEXT: vmovdqu %xmm7, 48(%rdi) -; AVX1-NEXT: vmovdqu %xmm3, 32(%rdi) -; AVX1-NEXT: vmovdqu %xmm1, 176(%rdi) -; AVX1-NEXT: vmovdqu %xmm0, 160(%rdi) -; AVX1-NEXT: vmovdqu %xmm12, 112(%rdi) -; AVX1-NEXT: vmovdqu %xmm6, 96(%rdi) -; AVX1-NEXT: vmovdqu %xmm9, 144(%rdi) -; AVX1-NEXT: vmovdqu %xmm8, 128(%rdi) -; AVX1-NEXT: addq $88, %rsp +; AVX1-NEXT: vmovdqu %xmm12, 48(%rdi) +; AVX1-NEXT: vmovdqu %xmm8, 32(%rdi) +; AVX1-NEXT: vmovdqu %xmm6, 176(%rdi) +; AVX1-NEXT: vmovdqu %xmm1, 160(%rdi) +; AVX1-NEXT: vmovdqu %xmm10, 112(%rdi) +; AVX1-NEXT: vmovdqu %xmm3, 96(%rdi) +; AVX1-NEXT: vmovdqu %xmm11, 144(%rdi) +; AVX1-NEXT: vmovdqu %xmm2, 128(%rdi) +; AVX1-NEXT: popq %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1307,125 +1296,116 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX1-LABEL: interleaved_load_vf64_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: subq $40, %rsp -; AVX1-NEXT: .cfi_def_cfa_offset 48 -; AVX1-NEXT: vmovdqu (%rdi), %xmm9 -; AVX1-NEXT: vmovdqu 16(%rdi), %xmm11 +; AVX1-NEXT: vmovdqu (%rdi), %xmm8 +; AVX1-NEXT: vmovups 16(%rdi), %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqu 48(%rdi), %xmm10 -; AVX1-NEXT: vmovdqu 64(%rdi), %xmm15 -; AVX1-NEXT: vmovdqu 80(%rdi), %xmm14 -; AVX1-NEXT: vmovdqu 96(%rdi), %xmm3 -; AVX1-NEXT: vmovdqu 112(%rdi), %xmm1 -; AVX1-NEXT: vmovdqu 144(%rdi), %xmm6 -; AVX1-NEXT: vmovdqu 160(%rdi), %xmm12 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqu 64(%rdi), %xmm3 +; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4 +; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5 +; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2 +; AVX1-NEXT: vmovdqu 144(%rdi), %xmm12 +; AVX1-NEXT: vmovdqu 160(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] +; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm6 +; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm7 +; AVX1-NEXT: vpshufb %xmm11, %xmm8, %xmm9 +; AVX1-NEXT: vpshufb %xmm11, %xmm10, %xmm11 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm15 +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpor %xmm5, %xmm15, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm13 -; AVX1-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX1-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm7 -; AVX1-NEXT: vmovdqa %xmm1, %xmm2 -; AVX1-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm1 +; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm15 +; 
AVX1-NEXT: vmovdqa %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vpshufb %xmm8, %xmm12, %xmm7 -; AVX1-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm1 +; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm7 -; AVX1-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm8, %xmm11, %xmm3 -; AVX1-NEXT: vpor %xmm7, %xmm3, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX1-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm8, %xmm15, %xmm7 -; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm7 -; AVX1-NEXT: vpshufb %xmm8, %xmm14, %xmm6 -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm15 -; AVX1-NEXT: vmovdqu 32(%rdi), %xmm7 -; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm9 -; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm10 -; AVX1-NEXT: vpor %xmm9, %xmm10, %xmm10 -; AVX1-NEXT: vmovdqu 176(%rdi), %xmm9 -; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX1-NEXT: vpshufb %xmm8, %xmm9, %xmm11 -; AVX1-NEXT: vpor %xmm1, %xmm11, %xmm11 -; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqu 128(%rdi), %xmm1 -; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm3 -; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm15 +; AVX1-NEXT: vpor %xmm8, %xmm15, %xmm5 +; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm13, %xmm10, %xmm8 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm10 +; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm10 +; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm8 +; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm5 +; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm8 +; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm14, %xmm8, %xmm12 +; AVX1-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX1-NEXT: vmovdqu 176(%rdi), %xmm12 +; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb %xmm14, %xmm12, %xmm15 +; AVX1-NEXT: vpor %xmm1, %xmm15, %xmm1 +; AVX1-NEXT: vpshufb %xmm13, %xmm2, %xmm13 +; AVX1-NEXT: vmovdqu 128(%rdi), %xmm15 +; AVX1-NEXT: vpshufb %xmm14, %xmm15, %xmm14 +; AVX1-NEXT: vpor %xmm13, %xmm14, %xmm14 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm6 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm6 -; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm6 -; AVX1-NEXT: vpor %xmm6, %xmm13, %xmm12 -; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm11 -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm10 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = 
xmm4[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] -; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm4 -; AVX1-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm4 -; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddb %xmm4, %xmm13, %xmm4 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm8, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm10, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm13 +; AVX1-NEXT: vpor %xmm6, %xmm13, %xmm13 +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm14 +; AVX1-NEXT: vpor %xmm7, %xmm14, %xmm14 +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb %xmm0, %xmm8, %xmm7 +; AVX1-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm11[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] +; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm4, %xmm10, %xmm4 +; AVX1-NEXT: vpaddb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5 +; AVX1-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5 +; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm8 +; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpaddb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5 +; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm8 +; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm3 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm3 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpshufb %xmm6, %xmm13, %xmm3 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: addq $40, %rsp -; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: interleaved_load_vf64_i8_stride3: @@ -1538,54 +1518,54 @@ ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX1-NEXT: vpunpckhbw 
{{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm11
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm13
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm13
 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX1-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3]
 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm14, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3
 ; AVX1-NEXT: vmovaps %ymm3, 224(%rdi)
 ; AVX1-NEXT: vmovaps %ymm0, 192(%rdi)
-; AVX1-NEXT: vmovaps %ymm7, 160(%rdi)
+; AVX1-NEXT: vmovaps %ymm1, 160(%rdi)
 ; AVX1-NEXT: vmovaps %ymm6, 128(%rdi)
-; AVX1-NEXT: vmovaps %ymm1, 96(%rdi)
-; AVX1-NEXT: vmovaps %ymm2, 64(%rdi)
+; AVX1-NEXT: vmovaps %ymm2, 96(%rdi)
+; AVX1-NEXT: vmovaps %ymm9, 64(%rdi)
 ; AVX1-NEXT: vmovaps %ymm4, 32(%rdi)
 ; AVX1-NEXT: vmovaps %ymm8, (%rdi)
 ; AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/znver3-gather.ll b/llvm/test/CodeGen/X86/znver3-gather.ll
--- a/llvm/test/CodeGen/X86/znver3-gather.ll
+++ b/llvm/test/CodeGen/X86/znver3-gather.ll
@@ -11,26 +11,26 @@
 ; X64-NEXT: vpmovsxdq %xmm2, %ymm2
 ; X64-NEXT: vpsllq $2, %ymm0, %ymm0
 ; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; X64-NEXT: vmovq %xmm0, %r8
-; X64-NEXT: vpextrq $1, %xmm0, %r9
+; X64-NEXT: vmovq %xmm0, %rax
+; X64-NEXT: vpextrq $1, %xmm0, %rcx
 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; X64-NEXT: vpsllq $2, %ymm2, %ymm2
 ; X64-NEXT: vpaddq %ymm2, %ymm1, %ymm2
 ; X64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: vpextrq $1, %xmm0, %r10
+; X64-NEXT: vpextrq $1, %xmm0, %rdx
 ; X64-NEXT: vmovq %xmm0, %rsi
 ; X64-NEXT: vextracti128 $1, %ymm2, %xmm0
 ; X64-NEXT: vmovq %xmm2, %rdi
-; X64-NEXT: vpextrq $1, %xmm2, %rax
-; X64-NEXT: vpinsrd $1, (%r9), %xmm1, %xmm1
-; X64-NEXT: vmovq %xmm0, %rcx
-; X64-NEXT: vpextrq $1, %xmm0, %rdx
+; X64-NEXT: vpextrq $1, %xmm2, %r8
+; X64-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm1
+; X64-NEXT: vmovq %xmm0, %r9
+; X64-NEXT: vpextrq $1, %xmm0, %r10
 ; X64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT: vpinsrd $2, (%rsi), %xmm1, %xmm1
-; X64-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, (%r10), %xmm1, %xmm1
-; X64-NEXT: vpinsrd $2, (%rcx), %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, (%rdx), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $1, (%r8), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, (%rdx), %xmm1, %xmm1
+; X64-NEXT: vpinsrd $2, (%r9), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, (%r10), %xmm0, %xmm0
 ; X64-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; X64-NEXT: retq
 %ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %offsets
@@ -48,26 +48,26 @@
 ; X64-NEXT: vpmovsxdq %xmm2, %ymm2
 ; X64-NEXT: vpsllq $2, %ymm0, %ymm0
 ; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; X64-NEXT: vmovq %xmm0, %r8
-; X64-NEXT: vpextrq $1, %xmm0, %r9
+; X64-NEXT: vmovq %xmm0, %rax
+; X64-NEXT: vpextrq $1, %xmm0, %rcx
 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; X64-NEXT: vpsllq $2, %ymm2, %ymm2
 ; X64-NEXT: vpaddq %ymm2, %ymm1, %ymm2
 ; X64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: vpextrq $1, %xmm0, %r10
+; X64-NEXT: vpextrq $1, %xmm0, %rdx
 ; X64-NEXT: vmovq %xmm0, %rsi
 ; X64-NEXT: vextracti128 $1, %ymm2, %xmm0
 ; X64-NEXT: vmovq %xmm2, %rdi
-; X64-NEXT: vpextrq $1, %xmm2, %rax
-; X64-NEXT: vpinsrd $1, (%r9), %xmm1, %xmm1
-; X64-NEXT: vmovq %xmm0, %rcx
-; X64-NEXT: vpextrq $1, %xmm0, %rdx
+; X64-NEXT: vpextrq $1, %xmm2, %r8
+; X64-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm1
+; X64-NEXT: vmovq %xmm0, %r9
+; X64-NEXT: vpextrq $1, %xmm0, %r10
 ; X64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT: vpinsrd $2, (%rsi), %xmm1, %xmm1
-; X64-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, (%r10), %xmm1, %xmm1
-; X64-NEXT: vpinsrd $2, (%rcx), %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, (%rdx), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $1, (%r8), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, (%rdx), %xmm1, %xmm1
+; X64-NEXT: vpinsrd $2, (%r9), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, (%r10), %xmm0, %xmm0
 ; X64-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; X64-NEXT: retq
 %ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %offsets
@@ -85,26 +85,26 @@
 ; X64-NEXT: vpmovsxdq %xmm2, %ymm2
 ; X64-NEXT: vpsllq $2, %ymm0, %ymm0
 ; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; X64-NEXT: vmovq %xmm0, %r8
-; X64-NEXT: vpextrq $1, %xmm0, %r9
+; X64-NEXT: vmovq %xmm0, %rax
+; X64-NEXT: vpextrq $1, %xmm0, %rcx
 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; X64-NEXT: vpsllq $2, %ymm2, %ymm2
 ; X64-NEXT: vpaddq %ymm2, %ymm1, %ymm2
 ; X64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: vpextrq $1, %xmm0, %r10
+; X64-NEXT: vpextrq $1, %xmm0, %rdx
 ; X64-NEXT: vmovq %xmm0, %rsi
 ; X64-NEXT: vextracti128 $1, %ymm2, %xmm0
 ; X64-NEXT: vmovq %xmm2, %rdi
-; X64-NEXT: vpextrq $1, %xmm2, %rax
-; X64-NEXT: vpinsrd $1, (%r9), %xmm1, %xmm1
-; X64-NEXT: vmovq %xmm0, %rcx
-; X64-NEXT: vpextrq $1, %xmm0, %rdx
+; X64-NEXT: vpextrq $1, %xmm2, %r8
+; X64-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm1
+; X64-NEXT: vmovq %xmm0, %r9
+; X64-NEXT: vpextrq $1, %xmm0, %r10
 ; X64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT: vpinsrd $2, (%rsi), %xmm1, %xmm1
-; X64-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, (%r10), %xmm1, %xmm1
-; X64-NEXT: vpinsrd $2, (%rcx), %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, (%rdx), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $1, (%r8), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, (%rdx), %xmm1, %xmm1
+; X64-NEXT: vpinsrd $2, (%r9), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, (%r10), %xmm0, %xmm0
 ; X64-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; X64-NEXT: retq
 %ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %offsets